In [None]:
# building a RAG(Retrieval Augmented Generation) pipeline from scratch 


In [1]:

# installing and verifying torch manually

# pip3 install torch torchvision



In [2]:
import torch

In [3]:
var = torch.rand(10,20)

In [4]:
var

tensor([[0.5896, 0.7405, 0.4318, 0.6794, 0.9521, 0.0934, 0.5035, 0.6383, 0.9757,
         0.0490, 0.4231, 0.7973, 0.2975, 0.6313, 0.7023, 0.0090, 0.2655, 0.3913,
         0.9591, 0.0300],
        [0.7601, 0.4640, 0.7692, 0.7797, 0.5399, 0.7324, 0.4939, 0.7826, 0.3750,
         0.9877, 0.9441, 0.4140, 0.7683, 0.5377, 0.3388, 0.7754, 0.4934, 0.4358,
         0.1821, 0.7938],
        [0.2906, 0.1608, 0.7488, 0.0713, 0.2377, 0.1399, 0.2662, 0.4450, 0.2013,
         0.1799, 0.0763, 0.8409, 0.4124, 0.5131, 0.3840, 0.9787, 0.8420, 0.4047,
         0.7416, 0.5673],
        [0.9478, 0.5520, 0.1299, 0.4541, 0.6727, 0.2297, 0.4384, 0.3955, 0.8875,
         0.9005, 0.2569, 0.8001, 0.6537, 0.5904, 0.8617, 0.7072, 0.4580, 0.2755,
         0.7071, 0.8985],
        [0.4682, 0.7900, 0.5041, 0.7696, 0.0892, 0.6577, 0.5213, 0.9373, 0.6070,
         0.1511, 0.6237, 0.2077, 0.2918, 0.7980, 0.5563, 0.3488, 0.6350, 0.9124,
         0.6287, 0.8722],
        [0.4791, 0.2203, 0.7109, 0.3883, 0.6416, 0.2433, 0.3

In [None]:
# RAG takes information from one place and pass it to LLM so it can generate outputs based on that information

In [5]:
### What is RAG ?

In [None]:
# Retrieval - Find relevant information given a query e.g "what are macro nutrients and what do they do"? -> this retieves passage of text relevant 
# to macro nutrients from the text book (pdf source which we have added here)

# Augmented - we want to take relevant information and augment our input(prompt) to an LLM with relevant information.

# Generation - Take the first two steps and pass it to the LLM for generative outputs.

In [None]:

# Main goal of RAG is to improve generation of outputs in LLM.

# 1. prevent hallucination - LLM are good at generating good looking text. but it doesn't mean that the data which we retrieved are factual.
# RAG can help LLMs generate information based on relevant passages that are factual.

# 2. Work with custom data - LLM's are trained with internet-scale data. 
# which means lots of the responses are generic.
# RAG helps to creeate specific responses based on specific documents (e.g your own companies support documents)




In [6]:
### How RAG can be useful ?


# customer support Q&A -> Treat the sources and when some one asks a specific question it should be able to retrieve a 
# relevant information / passage and have an LLM to craft those snippets into an answer/ ex: chatbot for your documentation

# Email chain analysis -> take some unstructured data from chain of emails and have an LLM to transform into structured data

# company internal documentation chat 

# Textbook q&a



# common : take your relevant documents to a query and process them with LLMs

# we can consider LLM as a calculator for words


In [7]:
### why we need local RAG ?

# privacy, speed, cost not dependant on vendors like open AI

In [None]:
# # workflow

# pdf -> pre process the text to smaller chunks -> smaller chunks (thi is our context) -> embedding model (turns text/query into numerical respresntation) ->
# store in pytorch tesnsor 


# RAG paper -> https://arxiv.org/pdf/2312.10997.pdf



In [8]:

# this is called document preprocessing and embedding creation

# open a pdf document
# format the text of pdf ready for an embedding model (in this case its called as chunks)
# embed all of the chunks and turn it into a numerical representation which can store for later


# this is for search and answer


# build a retrieval system that uses vector search to find the relevant chunk of text based on a query
# create a prompt that incorporates the retrieved pieces of text
# generate an answer to the query based on the passages of the text book with LLM

In [None]:


### document preprocessing and embedding creation




In [None]:
# Ingredients :
# pdf document of choice
# embedding model of choice

# steps :
# import pdf document
# process text for embedding (split into chunk of sentences)
# embed text chunks with embedding model
# save embedding to file for later

In [29]:
import os
import requests


# get the pdf document path 

pdf_path = "Human-Nutrition-2020-Edition-1598491699.pdf"

# check
print(os.path.exists(pdf_path))

# download PDF

if not os.path.exists(pdf_path):
    print("doesn't exist...... downloding")
    url = "https://pressbooks.oer.hawaii.edu/humannutrition2/open/download?type=pdf"
    
    #the local filename to save the downloaded file 
    filename = pdf_path

    #sending a get request to the URL
    response = requests.get(url)


    # check if the request is succesfull
    if response.status_code == 200:
        # open the file and save it
        with open(filename, "wb") as file:
            file.write(response.content)
        print(f"file has been downloaded and saved as {filename}")

    else:
        print(f"{response.status_code}")

else:
    print(f"file {pdf_path} exists") 


True
file Human-Nutrition-2020-Edition-1598491699.pdf exists


# Opening the PDF

In [45]:
import fitz # pip install pyMuPdf
from tqdm.auto import tqdm


# small helper / text formatting function to process the pdf

def text_formatter(text: str) -> str:
    """ performs minor formatting on text. """
    cleaned_text = text.replace("\n", " ").strip()


    # more text formatting can go here


    return cleaned_text


def open_and_read_pdf(pdf_path: str) -> list[dict]:
    doc = fitz.open(pdf_path)
    # empty list
    pages_and_texts = []

    for page_number, page in tqdm(enumerate(doc)):
        text = page.get_text()
        text = text_formatter(text=text)
        pages_and_texts.append({"page_number": page_number - 41,
        "page_char_count": len(text),
        "page_word_count": len(text.split(" ")),
        "page_sentence_count": len(text.split(". ")),
        "page_token_count": len(text)/4, # 1 token ~ 4 characters
        "text":text})


    return pages_and_texts

pages_and_texts = open_and_read_pdf(pdf_path=pdf_path)

pages_and_texts[:20]

0it [00:00, ?it/s]

[{'page_number': -41,
  'page_char_count': 29,
  'page_word_count': 4,
  'page_sentence_count': 1,
  'page_token_count': 7.25,
  'text': 'Human Nutrition: 2020 Edition'},
 {'page_number': -40,
  'page_char_count': 0,
  'page_word_count': 1,
  'page_sentence_count': 1,
  'page_token_count': 0.0,
  'text': ''},
 {'page_number': -39,
  'page_char_count': 320,
  'page_word_count': 54,
  'page_sentence_count': 1,
  'page_token_count': 80.0,
  'text': 'Human Nutrition: 2020  Edition  UNIVERSITY OF HAWAI‘I AT MĀNOA  FOOD SCIENCE AND HUMAN  NUTRITION PROGRAM  ALAN TITCHENAL, SKYLAR HARA,  NOEMI ARCEO CAACBAY, WILLIAM  MEINKE-LAU, YA-YUN YANG, MARIE  KAINOA FIALKOWSKI REVILLA,  JENNIFER DRAPER, GEMADY  LANGFELDER, CHERYL GIBBY, CHYNA  NICOLE CHUN, AND ALLISON  CALABRESE'},
 {'page_number': -38,
  'page_char_count': 212,
  'page_word_count': 32,
  'page_sentence_count': 1,
  'page_token_count': 53.0,
  'text': 'Human Nutrition: 2020 Edition by University of Hawai‘i at Mānoa Food Science and  Hum

In [59]:
# choosing a page random

import random

random.sample(pages_and_texts, k=3)

[{'page_number': 273,
  'page_char_count': 1298,
  'page_word_count': 316,
  'page_sentence_count': 10,
  'page_token_count': 324.5,
  'text': 'Foods  Total  Carbohydrates  Sugars Fiber Added  Sugars  Banana  27 (1 medium)  14.40  3.1  0  Lentils  40 (1 c.)  3.50  16.0  0  Snap beans  8.7 (1 c.)  1.60  4.0  0  Green pepper  5.5 (1 medium)  2.90  2.0  0  Corn tortilla  10.7 (1)  0.20  1.5  0  Bread, wheat bran  17.2 (1 slice)  3.50  1.4  3.4  Bread, rye  15.5 (1 slice)  1.20  1.9  1.0  Bagel (plain)  53 (1 medium)  5.30  2.3  4.8  Brownie  36 (1 square)  20.50  1.2  20.0  Oatmeal cookie  22.3 (1 oz.)  12.00  2.0  7.7  Cornflakes  23 (1 c.)  1.50  0.3  1.5  Pretzels  47 (10 twists)  1.30  1.7  0  Popcorn  (homemade)  58 (100 g)  0.50  10.0  0  Skim milk  12 (1 c.)  12.00  0  0  Cream (half and  half)  0.65 (1 Tbs.)  0.02  0  0  Cream substitute  1.0 (1 tsp.)  1.00  0  1.0  Cheddar cheese  1.3 (1 slice)  0.50  0  0  Yogurt (with fruit)  32.3 (6 oz.)  32.30  0  19.4  Caesar dressing  2.8 (

# Exploratory Data Analysis on Text

In [62]:
import pandas as pd

data = pd.DataFrame(pages_and_texts)

data.head()

Unnamed: 0,page_number,page_char_count,page_word_count,page_sentence_count,page_token_count,text
0,-41,29,4,1,7.25,Human Nutrition: 2020 Edition
1,-40,0,1,1,0.0,
2,-39,320,54,1,80.0,Human Nutrition: 2020 Edition UNIVERSITY OF ...
3,-38,212,32,1,53.0,Human Nutrition: 2020 Edition by University of...
4,-37,797,147,3,199.25,Contents Preface University of Hawai‘i at Mā...


In [76]:
data['page_char_count'].max()

2308

In [None]:
data['page_token_count'].max()