In [1]:
pdf_path = "/app/1932.pdf"


In [2]:
# Requires !pip install PyMuPDF, see: https://github.com/pymupdf/pymupdf
import fitz # (pymupdf, found this is better than pypdf for our use case, note: licence is AGPL-3.0, keep that in mind if you want to use any code commercially)
from tqdm.auto import tqdm # for progress bars, requires !pip install tqdm 

def text_formatter(text: str) -> str:
    """Performs minor formatting on text."""
    cleaned_text = text.replace("\n", " ").strip() # note: this might be different for each doc (best to experiment)

    # Other potential text formatting functions can go here
    return cleaned_text

# Open PDF and get lines/pages
# Note: this only focuses on text, rather than images/figures etc
def open_and_read_pdf(pdf_path: str) -> list[dict]:
    """
    Opens a PDF file, reads its text content page by page, and collects statistics.

    Parameters:
        pdf_path (str): The file path to the PDF document to be opened and read.

    Returns:
        list[dict]: A list of dictionaries, each containing the page number
        (adjusted), character count, word count, sentence count, token count, and the extracted text
        for each page.
    """
    doc = fitz.open(pdf_path)  # open a document
    pages_and_texts = []
    for page_number, page in tqdm(enumerate(doc)):  # iterate the document pages
        text = page.get_text()  # get plain text encoded as UTF-8
        text = text_formatter(text)
        pages_and_texts.append({"page_number": page_number - 0 , # adjust page numbers since our PDF starts on page 42
                                "page_char_count": len(text),
                                "page_word_count": len(text.split(" ")),
                                "page_sentence_count_raw": len(text.split(". ")),
                                "page_token_count": len(text) / 4,  # 1 token = ~4 chars, see: https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
                                "text": text})
    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)
pages_and_texts[:3]

0it [00:00, ?it/s]

[{'page_number': 0,
  'page_char_count': 43,
  'page_word_count': 19,
  'page_sentence_count_raw': 1,
  'page_token_count': 10.75,
  'text': 'Pacifica O W N E R ’ S  M A N U A L 2 0 1 7'},
 {'page_number': 1,
  'page_char_count': 1332,
  'page_word_count': 222,
  'page_sentence_count_raw': 14,
  'page_token_count': 333.0,
 {'page_number': 2,
  'page_char_count': 1798,
  'page_word_count': 786,
  'page_sentence_count_raw': 713,
  'page_token_count': 449.5,
  'text': 'TABLE OF CONTENTS SECTION PAGE 1 INTRODUCTION . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 3 2 GRAPHICAL TABLE OF CONTENTS . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7 3 GETTING TO KNOW YOUR VEHICLE . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 13 4 GETTING TO KNOW YOUR INSTRUMENT PANEL . . . . . . . . . . . . . . . .

In [3]:
pages_and_texts_mini = pages_and_texts[328:339]
pages_and_texts_mini

[{'page_number': 328,
  'page_char_count': 1700,
  'page_word_count': 296,
  'page_sentence_count_raw': 23,
  'page_token_count': 425.0,
 {'page_number': 329,
  'page_char_count': 1914,
  'page_word_count': 315,
  'page_sentence_count_raw': 14,
  'page_token_count': 478.5,
 {'page_number': 330,
  'page_char_count': 2000,
  'page_word_count': 350,
  'page_sentence_count_raw': 24,
  'page_token_count': 500.0,
  'text': 'To Turn Off The Engine Using ENGINE START/STOP Button 1. Place the gear selector in PARK, then push and release the ENGINE START/STOP button. 2. The ignition switch will return to the OFF position. 3. If the gear selector is not in PARK, the ENGINE START/ STOP button must be held for two seconds or three short pushes in a row with the vehicle speed above 5 mph (8 km/h) before the engine will shut off. The ignition switch position will remain in the ACC position until the gear selector is in PARK and the button is pushed twice to the OFF position. If the gear selector is n

In [4]:
import pandas as pd

df = pd.DataFrame(pages_and_texts_mini)
df.head(20)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,text
0,328,1700,296,23,425.0,STARTING THE ENGINE Before starting your vehic...
1,329,1914,315,14,478.5,ENGINE START/STOP Button Functions — With Driv...
2,330,2000,350,24,500.0,To Turn Off The Engine Using ENGINE START/STOP...
3,331,1663,270,18,415.75,NOTE: • The engine block heater cord is a fact...
4,332,1340,231,13,335.0,PARK BRAKE Electric Park Brake (EPB) Your vehi...
5,333,2389,418,22,597.25,NOTE: The EPB fault lamp will illuminate if th...
6,334,1904,334,19,476.0,WARNING! (Continued) • Do not leave the key fo...
7,335,2145,360,18,536.25,Auto Park Brake The Electric Park Brake can be...
8,336,1910,329,20,477.5,"When brake service work is complete, the follo..."
9,337,1880,329,18,470.0,WARNING! (Continued) should never exit a vehic...


In [5]:
from spacy.lang.en import English 

nlp = English()

# Add a sentencizer pipeline, see https://spacy.io/api/sentencizer/ 
nlp.add_pipe("sentencizer")

for item in tqdm(pages_and_texts_mini):
    item["sentences"] = list(nlp(item["text"]).sents)
    
    # Make sure all sentences are strings
    item["sentences"] = [str(sentence) for sentence in item["sentences"]]
    
    # Count the sentences 
    item["page_sentence_count_spacy"] = len(item["sentences"])

  0%|          | 0/11 [00:00<?, ?it/s]

In [6]:
# Create a DataFrame to get stats
df = pd.DataFrame(pages_and_texts_mini)
df.describe().round(2)

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count_raw,page_token_count,page_sentence_count_spacy
count,11.0,11.0,11.0,11.0,11.0,11.0
mean,333.0,1925.18,327.27,19.0,481.3,20.09
std,3.32,299.88,50.21,3.41,74.97,3.59
min,328.0,1340.0,231.0,13.0,335.0,13.0
25%,330.5,1790.0,305.5,18.0,447.5,19.0
50%,333.0,1910.0,329.0,19.0,477.5,20.0
75%,335.5,2072.5,355.0,21.0,518.12,23.0
max,338.0,2389.0,418.0,24.0,597.25,24.0


In [7]:
# Define split size to turn groups of sentences into chunks
num_sentence_chunk_size = 10 

# Create a function that recursively splits a list into desired sizes
def split_list(input_list: list, 
               slice_size: int) -> list[list[str]]:
    """
    Splits the input_list into sublists of size slice_size (or as close as possible).

    For example, a list of 17 sentences would be split into two lists of [[10], [7]]
    """
    return [input_list[i:i + slice_size] for i in range(0, len(input_list), slice_size)]

# Loop through pages and texts and split sentences into chunks
for item in tqdm(pages_and_texts_mini):
    item["sentence_chunks"] = split_list(input_list=item["sentences"],
                                         slice_size=num_sentence_chunk_size)
    item["num_chunks"] = len(item["sentence_chunks"])

  0%|          | 0/11 [00:00<?, ?it/s]

In [8]:
import re

# Split each chunk into its own item
pages_and_chunks = []
for item in tqdm(pages_and_texts_mini):
    for sentence_chunk in item["sentence_chunks"]:
        chunk_dict = {}
        chunk_dict["page_number"] = item["page_number"]
        
        # Join the sentences together into a paragraph-like structure, aka a chunk (so they are a single string)
        joined_sentence_chunk = "".join(sentence_chunk).replace("  ", " ").strip()
        joined_sentence_chunk = re.sub(r'\.([A-Z])', r'. \1', joined_sentence_chunk) # ".A" -> ". A" for any full-stop/capital letter combo 
        chunk_dict["sentence_chunk"] = joined_sentence_chunk

        # Get stats about the chunk
        chunk_dict["chunk_char_count"] = len(joined_sentence_chunk)
        chunk_dict["chunk_word_count"] = len([word for word in joined_sentence_chunk.split(" ")])
        chunk_dict["chunk_token_count"] = len(joined_sentence_chunk) / 4 # 1 token = ~4 characters
        
        pages_and_chunks.append(chunk_dict)

# How many chunks do we have?
len(pages_and_chunks)

  0%|          | 0/11 [00:00<?, ?it/s]

27

In [9]:
df = pd.DataFrame(pages_and_chunks)
df.describe().round(2)

Unnamed: 0,page_number,chunk_char_count,chunk_word_count,chunk_token_count
count,27.0,27.0,27.0,27.0
mean,332.85,781.93,131.52,195.48
std,3.18,381.63,63.82,95.41
min,328.0,28.0,5.0,7.0
25%,330.0,501.5,83.0,125.38
50%,333.0,885.0,153.0,221.25
75%,335.5,1023.5,175.0,255.88
max,338.0,1467.0,239.0,366.75


In [10]:
# Show random chunks with under 30 tokens in length
min_token_length = 30
for row in df[df["chunk_token_count"] <= min_token_length].iterrows():
    print(f'Chunk token count: {row[1]["chunk_token_count"]} | Text: {row[1]["sentence_chunk"]}')

Chunk token count: 7.0 | Text: 6 STARTING AND OPERATING 333


In [11]:
pages_and_chunks_over_min_token_len = df[df["chunk_token_count"] > min_token_length].to_dict(orient="records")
pages_and_chunks_over_min_token_len[:2]

[{'page_number': 328,
  'chunk_char_count': 909,
  'chunk_word_count': 156,
  'chunk_token_count': 227.25},
 {'page_number': 328,
  'sentence_chunk': 'Interior heat build-up may cause serious injury or death. Start the engine with the gear selector in the NEUTRAL or PARK position. Apply the brake before shifting to any driving range. Normal Starting NOTE: Normal starting of either a cold or a warm engine is obtained without pumping or pressing the accelerator pedal. To Turn On The Engine Using ENGINE START/STOP Button 1. The transmission must be in PARK or NEUTRAL.2. Press and hold the brake pedal while pushing the ENGINE START/STOP button once.3. The system takes over and attempts to start the vehicle.',
  'chunk_char_count': 561,
  'chunk_word_count': 94,
  'chunk_token_count': 140.25}]

In [12]:
# Requires !pip install sentence-transformers
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", 
                                      device="cuda") # choose the device to load the model to (note: GPU will often be *much* faster than CPU)



In [13]:
# Uncomment to see how long it takes to create embeddings on CPU
# # Make sure the model is on the CPU
embedding_model.to("cuda")

# Embed each chunk one by one
for item in tqdm(pages_and_chunks_over_min_token_len):
    item["embedding"] = embedding_model.encode(item["sentence_chunk"])

  0%|          | 0/26 [00:00<?, ?it/s]

In [14]:
# Turn text chunks into a single list
text_chunks = [item["sentence_chunk"] for item in pages_and_chunks_over_min_token_len]

In [15]:
# Embed all texts in batches
text_chunk_embeddings = embedding_model.encode(text_chunks,
                                               batch_size=32, # you can use different batch sizes here for speed/performance, I found 32 works well for this use case
                                               convert_to_tensor=True) # optional to return embeddings as tensor instead of array

text_chunk_embeddings

tensor([[-3.0038e-02, -6.9050e-02,  2.1764e-03,  ..., -3.1497e-02,
         -2.8311e-02,  1.1720e-02],
        [-8.4264e-02, -6.7062e-02,  3.6071e-03,  ..., -2.2037e-02,
         -7.5465e-02,  1.5554e-02],
        [-6.7777e-02, -1.4139e-02,  2.8875e-02,  ...,  3.8044e-02,
         -3.1119e-02,  3.5510e-02],
        ...,
        [-5.1009e-02, -4.1736e-02, -1.8790e-02,  ..., -3.5151e-02,
         -5.6504e-02,  2.7649e-02],
        [-2.0146e-02, -1.0817e-01, -1.8390e-02,  ..., -6.9364e-03,
         -4.6984e-02,  2.6773e-02],
        [-3.9979e-02, -5.3506e-02,  5.3694e-03,  ..., -6.2039e-05,
         -2.2721e-02,  3.2149e-02]], device='cuda:0')

In [16]:
# Save embeddings to file
text_chunks_and_embeddings_df = pd.DataFrame(pages_and_chunks_over_min_token_len)
embeddings_df_save_path = "mini_text_chunks_and_embeddings_df.csv"
text_chunks_and_embeddings_df.to_csv(embeddings_df_save_path, index=False)

In [17]:
import random

import torch
import numpy as np 
import pandas as pd

device = "cuda" if torch.cuda.is_available() else "cpu"
torch.cuda.empty_cache()


# Import texts and embedding df
text_chunks_and_embedding_df = pd.read_csv("mini_text_chunks_and_embeddings_df.csv")

# Convert embedding column back to np.array (it got converted to string when it got saved to CSV)
text_chunks_and_embedding_df["embedding"] = text_chunks_and_embedding_df["embedding"].apply(lambda x: np.fromstring(x.strip("[]"), sep=" "))

# Convert texts and embedding df to list of dicts
pages_and_chunks = text_chunks_and_embedding_df.to_dict(orient="records")

# Convert embeddings to torch tensor and send to device (note: NumPy arrays are float64, torch tensors are float32 by default)
embeddings = torch.tensor(np.array(text_chunks_and_embedding_df["embedding"].tolist()), dtype=torch.float32).to(device)
embeddings.shape

torch.Size([26, 768])

In [18]:
embeddings[0]

tensor([-3.0038e-02, -6.9050e-02,  2.1764e-03,  9.7545e-03,  5.4615e-02,
         2.0647e-02,  4.4747e-02, -4.1528e-02, -1.3413e-02, -5.3229e-02,
         3.8800e-03,  5.6261e-04,  1.7737e-02, -2.1511e-02, -3.9430e-02,
         4.3398e-03, -2.3517e-02, -3.8912e-02, -5.3455e-02, -9.5492e-03,
        -2.2950e-02,  2.0181e-02, -2.6534e-02, -9.1144e-03, -5.7861e-02,
        -5.0820e-02,  2.8594e-02,  2.0955e-02,  6.3897e-02, -6.0453e-02,
         1.6675e-02,  1.4319e-02,  2.4334e-02,  3.3872e-03,  1.8682e-06,
        -9.2624e-03,  7.3216e-03, -1.0135e-02,  5.6885e-03, -1.6288e-02,
         1.1772e-02,  4.1039e-02,  4.7015e-02,  3.4457e-02, -4.9442e-03,
        -4.9761e-02,  2.8867e-02,  5.6088e-02,  4.7971e-02, -7.6803e-03,
        -3.6312e-02, -1.5688e-03, -5.0419e-02,  7.6028e-02, -1.4526e-02,
        -2.6977e-02, -4.4065e-04,  9.5101e-03,  3.0835e-02, -5.6715e-02,
         1.6736e-02, -2.8949e-02,  3.6877e-02, -4.8995e-02,  4.5350e-02,
         2.9658e-02, -1.1218e-02, -7.1507e-02, -3.1

In [19]:
from sentence_transformers import util

# 1. Define the query
# Note: This could be anything. But since we're working with a nutrition textbook, we'll stick with nutrition-based queries.
query = "Parking break function"
print(f"Query: {query}")

# 2. Embed the query to the same numerical space as the text examples 
# Note: It's important to embed your query with the same model you embedded your examples with.
query_embedding = embedding_model.encode(query, convert_to_tensor=True)

# 3. Get similarity scores with the dot product (we'll time this for fun)
from time import perf_counter as timer

start_time = timer()
dot_scores = util.dot_score(a=query_embedding, b=embeddings)[0]
end_time = timer()

print(f"Time take to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

# 4. Get the top-k results (we'll keep this to 5)
top_results_dot_product = torch.topk(dot_scores, k=5)
top_results_dot_product

Query: Parking break function
Time take to get scores on 26 embeddings: 0.03067 seconds.


torch.return_types.topk(
values=tensor([0.6279, 0.5955, 0.5805, 0.5785, 0.5693], device='cuda:0'),
indices=tensor([13, 19, 10, 16, 17], device='cuda:0'))

In [20]:
# Define helper function to print wrapped text 
import textwrap

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

print(f"Query: '{query}'\n")
print("Results:")
# Loop through zipped together scores and indicies from torch.topk
for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
    print(f"Score: {score:.4f}")
    # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
    print("Text:")
    print_wrapped(pages_and_chunks[idx]["sentence_chunk"])
    # Print the page number too so we can reference the textbook further (and check the results)
    print(f"Page number: {pages_and_chunks[idx]['page_number']}")
    print("\n")

Query: 'Parking break function'

Results:
Score: 0.6279
Text:
instrument cluster and the LED indicator on the switch will extinguish. NOTE:
When parking on a hill, it is important to turn the front wheels toward the curb
on a downhill grade and away from the curb on an uphill grade. Apply the park
brake before placing the gear selector in PARK, otherwise the load on the
transmission locking mechanism may make it difficult to move the gear selector
out of PARK. The park brake should always be applied whenever the driver is not
parking brake. Always apply the parking brake fully when parked to guard against
vehicle movement and possible injury or damage. •When exiting the vehicle,
always make sure the ignition is in the OFF mode, remove the key fob from the
vehicle, and lock your vehicle. •Never leave children alone in a vehicle, or
with access to an unlocked vehicle. Allowing children to be in a vehicle
unattended is dangerous for a number of reasons.
Page number: 333


Score: 0.5955
Te

In [21]:
def retrieve_relevant_resources(query: str,
                                embeddings: torch.tensor,
                                model: SentenceTransformer=embedding_model,
                                n_resources_to_return: int=5,
                                print_time: bool=True):
    """
    Embeds a query with model and returns top k scores and indices from embeddings.
    """

    # Embed the query
    query_embedding = model.encode(query, convert_to_tensor=True) 

    # Get dot product scores on embeddings
    start_time = timer()
    #dot_scores = util.dot_score(query_embedding, embeddings)[0]
    dot_scores = util.cos_sim(query_embedding, embeddings)[0]    
    
    end_time = timer()

    if print_time:
        print(f"[INFO] Time taken to get scores on {len(embeddings)} embeddings: {end_time-start_time:.5f} seconds.")

    scores, indices = torch.topk(input=dot_scores, 
                                 k=n_resources_to_return)

    return scores, indices

def print_top_results_and_scores(query: str,
                                 embeddings: torch.tensor,
                                 pages_and_chunks: list[dict]=pages_and_chunks,
                                 n_resources_to_return: int=5):
    """
    Takes a query, retrieves most relevant resources and prints them out in descending order.

    Note: Requires pages_and_chunks to be formatted in a specific way (see above for reference).
    """
    
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings,
                                                  n_resources_to_return=n_resources_to_return)
    
    print(f"Query: {query}\n")
    print("Results:")
    # Loop through zipped together scores and indicies
    for score, index in zip(scores, indices):
        print(f"Score: {score:.4f}")
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        print_wrapped(pages_and_chunks[index]["sentence_chunk"])
        # Print the page number too so we can reference the textbook further and check the results
        print(f"Page number: {pages_and_chunks[index]['page_number']}")
        print("\n")

In [22]:
query = "Parking break function"

# Print out the texts of the top scores
print_top_results_and_scores(query=query, embeddings=embeddings)

[INFO] Time taken to get scores on 26 embeddings: 0.00057 seconds.
Query: Parking break function

Results:
Score: 0.6279
instrument cluster and the LED indicator on the switch will extinguish. NOTE:
When parking on a hill, it is important to turn the front wheels toward the curb
on a downhill grade and away from the curb on an uphill grade. Apply the park
brake before placing the gear selector in PARK, otherwise the load on the
transmission locking mechanism may make it difficult to move the gear selector
out of PARK. The park brake should always be applied whenever the driver is not
parking brake. Always apply the parking brake fully when parked to guard against
vehicle movement and possible injury or damage. •When exiting the vehicle,
always make sure the ignition is in the OFF mode, remove the key fob from the
vehicle, and lock your vehicle. •Never leave children alone in a vehicle, or
with access to an unlocked vehicle. Allowing children to be in a vehicle
unattended is dangerous f

In [25]:
# Get GPU available memory
import torch
gpu_memory_bytes = torch.cuda.get_device_properties(0).total_memory
gpu_memory_gb = round(gpu_memory_bytes / (2**30))
print(f"Available GPU memory: {gpu_memory_gb} GB")

# Note: the following is Gemma focused, however, there are more and more LLMs of the 2B and 7B size appearing for local use.
if gpu_memory_gb < 5.1:
    print(f"Your available GPU memory is {gpu_memory_gb}GB, you may not have enough memory to run a Gemma LLM locally without quantization.")
elif gpu_memory_gb < 8.1:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in 4-bit precision.")
    use_quantization_config = True 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb < 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommended model: Gemma 2B in float16 or Gemma 7B in 4-bit precision.")
    use_quantization_config = False 
    model_id = "google/gemma-2b-it"
elif gpu_memory_gb > 19.0:
    print(f"GPU memory: {gpu_memory_gb} | Recommend model: Gemma 7B in 4-bit or float16 precision.")
    use_quantization_config = False 
    model_id = "google/gemma-7b-it"

print(f"use_quantization_config set to: {use_quantization_config}")
print(f"model_id set to: {model_id}")

Available GPU memory: 8 GB
GPU memory: 8 | Recommended model: Gemma 2B in 4-bit precision.
use_quantization_config set to: True
model_id set to: google/gemma-2b-it


In [26]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers.utils import is_flash_attn_2_available 

# 1. Create quantization config for smaller model loading (optional)
# Requires !pip install bitsandbytes accelerate, see: https://github.com/TimDettmers/bitsandbytes, https://huggingface.co/docs/accelerate/
# For models that require 4-bit quantization (use this if you have low GPU memory available)
from transformers import BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(load_in_4bit=True,
                                         bnb_4bit_compute_dtype=torch.float16)

# Bonus: Setup Flash Attention 2 for faster inference, default to "sdpa" or "scaled dot product attention" if it's not available
# Flash Attention 2 requires NVIDIA GPU compute capability of 8.0 or above, see: https://developer.nvidia.com/cuda-gpus
# Requires !pip install flash-attn, see: https://github.com/Dao-AILab/flash-attention 
if (is_flash_attn_2_available()) and (torch.cuda.get_device_capability(0)[0] >= 8):
  attn_implementation = "flash_attention_2"
else:
  attn_implementation = "sdpa"
print(f"[INFO] Using attention implementation: {attn_implementation}")

# 2. Pick a model we'd like to use (this will depend on how much GPU memory you have available)
#model_id = "google/gemma-7b-it"
model_id = 'google/gemma-2b-it' # (we already set this above)
print(f"[INFO] Using model_id: {model_id}")

# 3. Instantiate tokenizer (tokenizer turns text into numbers ready for the model) 
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

# 4. Instantiate the model
llm_model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path=model_id, 
                                                 torch_dtype=torch.float16, # datatype to use, we want float16
                                                 quantization_config=quantization_config if use_quantization_config else None,
                                                 low_cpu_mem_usage=False, # use full memory 
                                                 attn_implementation=attn_implementation) # which attention version to use

if not use_quantization_config: # quantization takes care of device setting automatically, so if it's not used, send model to GPU 
    llm_model.to("cuda")

llm_model

[INFO] Using attention implementation: sdpa
[INFO] Using model_id: google/gemma-2b-it




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [28]:
#How about we get the number of parameters in our model? 
def get_model_num_params(model: torch.nn.Module):
    return sum([param.numel() for param in model.parameters()])

get_model_num_params(llm_model)

1515268096

In [29]:
#How about we get the models memory requirements?

def get_model_mem_size(model: torch.nn.Module):
    """
    Get how much memory a PyTorch model takes up.

    See: https://discuss.pytorch.org/t/gpu-memory-that-model-uses/56822
    """
    # Get model parameters and buffer sizes
    mem_params = sum([param.nelement() * param.element_size() for param in model.parameters()])
    mem_buffers = sum([buf.nelement() * buf.element_size() for buf in model.buffers()])

    # Calculate various model sizes
    model_mem_bytes = mem_params + mem_buffers # in bytes
    model_mem_mb = model_mem_bytes / (1024**2) # in megabytes
    model_mem_gb = model_mem_bytes / (1024**3) # in gigabytes

    return {"model_mem_bytes": model_mem_bytes,
            "model_mem_mb": round(model_mem_mb, 2),
            "model_mem_gb": round(model_mem_gb, 2)}

get_model_mem_size(llm_model)

{'model_mem_bytes': 2106740736, 'model_mem_mb': 2009.14, 'model_mem_gb': 1.96}

In [30]:
input_text = "What is EPB, How it is different?"
print(f"Input text:\n{input_text}")

# Create prompt template for instruction-tuned model
dialogue_template = [
    {"role": "user",
     "content": input_text}
]

# Apply the chat template
prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                       tokenize=False, # keep as raw text (not tokenized)
                                       add_generation_prompt=True)
print(f"\nPrompt (formatted):\n{prompt}")

Input text:
What is EPB, How it is different?

Prompt (formatted):
<bos><start_of_turn>user
What is EPB, How it is different?<end_of_turn>
<start_of_turn>model



In [31]:
%%time

# Tokenize the input text (turn it into numbers) and send it to GPU
input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")
print(f"Model input (tokenized):\n{input_ids}\n")

# Generate outputs passed on the tokenized input
# See generate docs: https://huggingface.co/docs/transformers/v4.38.2/en/main_classes/text_generation#transformers.GenerationConfig 
outputs = llm_model.generate(**input_ids,
                             max_new_tokens=256) # define the maximum number of new tokens to create
print(f"Model output (tokens):\n{outputs[0]}\n")

Model input (tokenized):
{'input_ids': tensor([[     2,      2,    106,   1645,    108,   1841,    603,  19235, 235305,
         235269,   2250,    665,    603,   2167, 235336,    107,    108,    106,
           2516,    108]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0')}

Model output (tokens):
tensor([     2,      2,    106,   1645,    108,   1841,    603,  19235, 235305,
        235269,   2250,    665,    603,   2167, 235336,    107,    108,    106,
          2516,    108,    688,   8239, 235305,    591, 136577,   1163,   6649,
        211363,  77056,    603,    476,   6178,    674,   8563,    692,    577,
          9616,   6733,   9630,    577,    476,   3821,   2384,  26513, 235265,
          1165, 235303, 235256,    476,  11137,    576,    476,   2384,  26513,
           578,    671,  56973,   3530, 235269,    832,    575,    974, 235265,
           109,    688,   2299,    665, 235303, 235256, 

In [32]:
# Decode the output tokens to text
outputs_decoded = tokenizer.decode(outputs[0])
print(f"Model output (decoded):\n{outputs_decoded}\n")

Model output (decoded):
<bos><bos><start_of_turn>user
What is EPB, How it is different?<end_of_turn>
<start_of_turn>model
**EPB (Ethernet over Power Budgeting)** is a technology that allows you to connect multiple devices to a single power outlet. It's a combination of a power outlet and an Ethernet port, all in one.

**How it's different:**

* **Traditional power outlets** require a separate power cord for each device.
* **EPB** uses a single power cord and an Ethernet cable to connect multiple devices to the same outlet.
* **EPB** is typically used in commercial and industrial settings, where it can save money and space.
* **EPB** is also becoming increasingly popular in homes, as it can be used to connect a variety of devices, including computers, printers, and other appliances.

**Benefits of EPB:**

* **Reduced power costs:** EPB can save you money on your electricity bill by allowing you to power multiple devices from a single outlet.
* **Space savings:** EPB can save you valuabl

In [82]:
gpt4_questions = [
    "when to apply parking brake?",
    "what does Brake Transmission Shift Interlock system do?",
    # "How often should you check and replace your air filter?",
    # "What is the purpose of the coolant system, and how can you tell if it needs servicing?",
    # "How do you know when your brake pads need to be replaced?",
    # "What should you do if your check engine light comes on?"
]

# Manually created additional questions
manual_questions = [
    "how to reset the parking brake system ?",
    "How do you jump-start a car safely?",
    # "What are the benefits of regular tire rotation?",
    # "How often should you replace your windshield wipers?",
    # "What fluids should be checked regularly for proper vehicle operation?",
    # "Does Chrysler Pachifica has EPB?"
]


query_list = gpt4_questions + manual_questions

import random
query = random.choice(query_list)

print(f"Query: {query}")

# Get just the scores and indices of top related results
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
scores, indices

Query: what does Brake Transmission Shift Interlock system do?
[INFO] Time taken to get scores on 26 embeddings: 0.00080 seconds.


(tensor([0.7637, 0.6356, 0.5866, 0.5624, 0.5478], device='cuda:0'),
 tensor([24, 23, 17, 25, 12], device='cuda:0'))

In [83]:
def prompt_formatter(query: str, 
                     context_items: list[dict]) -> str:
    """
    Augments query with text-based context from context_items.
    """
    # Join context items into one dotted paragraph
    context = "- " + "\n- ".join([item["sentence_chunk"] for item in context_items])

    # Create a base prompt with examples to help the model
    # Note: this is very customizable, I've chosen to use 3 examples of the answer style we'd like.
    # We could also write this in a txt file and import it in if we wanted.
    base_prompt = """Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    \nExample:
    Query: What are the signs that your car needs an oil change?
    Answer:  The signs that your car needs an oil change include a warning light on the dashboard, dark and gritty oil, unusual engine noises, decreased fuel efficiency, and a burnt smell from the engine. Regularly checking the oil level and its condition can help determine the right time for an oil change, typically every 5,000 to 7,500 miles..
    \nNow use the following context items to answer the user query:
    {context}
    \nRelevant passages: <extract relevant passages from the context here>
    User query: {query}
    Answer:"""


    # Update base prompt with context items and query   
    base_prompt = base_prompt.format(context=context, query=query)

    # Create prompt template for instruction-tuned model
    dialogue_template = [
        {"role": "user",
        "content": base_prompt}
    ]

    # Apply the chat template
    prompt = tokenizer.apply_chat_template(conversation=dialogue_template,
                                          tokenize=False,
                                          add_generation_prompt=True)
    return prompt

In [115]:
query = random.choice(query_list)
print(f"Query: {query}")

# Get relevant resources
scores, indices = retrieve_relevant_resources(query=query,
                                              embeddings=embeddings)
    
# Create a list of context items
context_items = [pages_and_chunks[i] for i in indices]

# Format prompt with context items
prompt = prompt_formatter(query=query,
                          context_items=context_items)
print(prompt)

Query: what does Brake Transmission Shift Interlock system do?
[INFO] Time taken to get scores on 26 embeddings: 0.00052 seconds.
<bos><start_of_turn>user
Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    
Example:
    Query: What are the signs that your car needs an oil change?
    
Now use the following context items to answer the user query:
    - Brake/Transmission Shift Interlock System This vehicle is equipped with a Brake Transmission Shift Interlock system (BTSI) that holds the transmission gear selector in PARK unless the brakes are applied. To shift the transmission out of PARK, the engine must be running and the brake pedal must be pressed. The brake pedal must also be

In [116]:
%%time

input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

# Generate an output of tokens
outputs = llm_model.generate(**input_ids,
                             temperature=0.7, # lower temperature = more deterministic outputs, higher temperature = more creative outputs
                             do_sample=True, # whether or not to use sampling, see https://huyenchip.com/2024/01/16/sampling.html for more
                             max_new_tokens=256) # how many new tokens to generate from prompt 

# Turn the output tokens into text
output_text = tokenizer.decode(outputs[0])
# Replace special tokens and unnecessary help message
output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

print(f"Query: {query}")
print(f"RAG answer:\n{output_text.replace(prompt, '')}")

Query: what does Brake Transmission Shift Interlock system do?
RAG answer:
Sure, here's the answer to the user query:

The Brake Transmission Shift Interlock system (BTSI) prevents the vehicle from shifting gears while the engine is running and the parking brake is not engaged. This system ensures that the transmission is in Park before the vehicle can be shifted into any gear.
CPU times: user 6.02 s, sys: 500 ms, total: 6.52 s
Wall time: 6.52 s


In [98]:
def ask(query, 
        temperature=0.7,
        max_new_tokens=512,
        format_answer_text=True, 
        return_answer_only=True):
    """
    Takes a query, finds relevant resources/context and generates an answer to the query based on the relevant resources.
    """
    
    # Get just the scores and indices of top related results
    scores, indices = retrieve_relevant_resources(query=query,
                                                  embeddings=embeddings)
    
    # Create a list of context items
    context_items = [pages_and_chunks[i] for i in indices]

    # Add score to context item
    print('context_items:')
    for i, item in enumerate(context_items):
        item["score"] = scores[i].cpu() # return score back to CPU 
        #print(item['sentence_chunk'])
        
        
    # Format the prompt with context items
    prompt = prompt_formatter(query=query,
                              context_items=context_items)
    
    print("promt :\n", prompt)
    
    # Tokenize the prompt
    input_ids = tokenizer(prompt, return_tensors="pt").to("cuda")

    # Generate an output of tokens
    outputs = llm_model.generate(**input_ids,
                                 temperature=temperature,
                                 do_sample=True,
                                 max_new_tokens=max_new_tokens)
    
    # Turn the output tokens into text
    output_text = tokenizer.decode(outputs[0])

    if format_answer_text:
        # Replace special tokens and unnecessary help message
        output_text = output_text.replace(prompt, "").replace("<bos>", "").replace("<eos>", "").replace("Sure, here is the answer to the user query:\n\n", "")

    # Only return the answer without the context items
    if return_answer_only:
        return output_text
    
    return output_text, context_items

In [117]:
#query = random.choice(query_list)
query = "what does Brake Transmission Shift Interlock system do?"

print(f"Query: {query}")

# Answer query with context and return context 
answer, context_items = ask(query=query, 
                            temperature=0.7,
                            max_new_tokens=512,
                            return_answer_only=False)

print(f"Answer:\n")
print_wrapped(answer)
print(f"Context items:")


Query: what does Brake Transmission Shift Interlock system do?
[INFO] Time taken to get scores on 26 embeddings: 0.00094 seconds.
context_items:
promt :
 <bos><start_of_turn>user
Based on the following context items, please answer the query.
    Give yourself room to think by extracting relevant passages from the context before answering the query.
    Don't return the thinking, only return the answer.
    Make sure your answers are as explanatory as possible.
    Use the following examples as reference for the ideal answer style.
    
Example:
    Query: What are the signs that your car needs an oil change?
    
Now use the following context items to answer the user query:
    - Brake/Transmission Shift Interlock System This vehicle is equipped with a Brake Transmission Shift Interlock system (BTSI) that holds the transmission gear selector in PARK unless the brakes are applied. To shift the transmission out of PARK, the engine must be running and the brake pedal must be pressed. The 