In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Echo the value of every top-level expression in a cell, not only the last one
# (this is why, further down, the tuple returned by tokenizer.save_pretrained(...)
# shows up in the output even though it is not the cell's final statement).
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os

# Project root. Defaults to the author's local checkout; set the
# LLM_W_RAG_POC_DIR environment variable to run the notebook from another
# machine or location without editing this cell.
path_dir = os.environ.get(
    'LLM_W_RAG_POC_DIR',
    '/Users/pinaki/Library/CloudStorage/OneDrive-Personal/Documents/Analytics_Projects/POCs_Python/Llm_w_rag_poc/llm_w_rag_poc/',
)
# Later cells build paths by plain string concatenation, so the root has to
# end with a separator.
if not path_dir.endswith('/'):
    path_dir += '/'

path_models = path_dir + 'models/'  # saved embedding model and LLM
path_data = path_dir + 'data/'      # source documents and the vector index

# Objective: 
An app that answers queries based on documents we have provided.

# Constraints:
1. LLM model: _**offline**_ 
2. Framework: _**langchain**_ | _**llamaindex**_
3. RAG:
    - Vector DB: _**FAISS**_ | _**chromadb**_
4. Frontend: _**streamlit**_


# Steps to carry out
1. Download an **instruction-tuned** LLM (<=7B parameters) for offline use
    Test the model by asking a few general questions
2. Build a simple langchain based framework
    Test asking a few specific questions on the topic of interest
3. Build a simple RAG based framework
    Test asking a few specific questions on the topic of interest
4. Build a simple streamlit app
    - Lets the user upload a few documents for reference (optional)
    - Allows users to input questions
    - Outputs the answers based on the reference documents
    - Lets the user give feedback on the quality of the answer
5. Store user question, generated answer, and user feedback in DB

# Code starts here

## Loading packages

In [3]:
from langchain import document_loaders as dl
from langchain import embeddings
from langchain import text_splitter as ts
from langchain import vectorstores as vs
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.runnable import RunnableParallel
from langchain.prompts import PromptTemplate
from operator import itemgetter
#Torch + transformers
import torch
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM
#Other useful modules
import re
import time

## Load document and chunk it

In [4]:
# Source PDF used as the knowledge base: an introductory philosophy text.
document_path = f"{path_data}raw/An-Introduction-to-Philosophy.pdf"
# Alternative input kept for reference:
# document_path = "quantum-mckinsey.pdf"

#we set default chunk size of 500 characters with an overlap of 20 characters
def split_doc(document_path, chunk_size=500, chunk_overlap=20):
    """Load a PDF and cut it into overlapping text chunks.

    Args:
        document_path: Path of the PDF handed to ``PyPDFLoader``.
        chunk_size: ``chunk_size`` forwarded to
            ``RecursiveCharacterTextSplitter`` (default 500).
        chunk_overlap: ``chunk_overlap`` forwarded to the splitter
            (default 20).

    Returns:
        Whatever ``split_documents`` returns for the loaded document.
    """
    pages = dl.PyPDFLoader(document_path).load()
    splitter = ts.RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(documents=pages)

# Chunk the PDF, then print every tenth chunk (labelled with its 1-based
# position) as a quick sanity check of the split.
document_splitted = split_doc(document_path)
for idx, doc in enumerate(document_splitted):
    if idx % 10:
        continue
    print(idx+1, ': ', doc)

1 :  page_content='1 \n  \n \nAn Introduction to Philosophy  \n \n \n \n \n \nW. Russ Payne  PhD \nBellevue College  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \nCopyright (c c by nc 4.0 )  \n2023 W. Russ Payne  \nPermission is granted to copy, distribute and/or modify this document  with attribution  under the \nterms of Creative Commons: Attribution Noncommercial  4.0 International or any later version of \nthis license. A copy of the license is found at http://creativecommons.org/licenses/by -nc/4.0/' metadata={'source': '/Users/pinaki/Library/CloudStorage/OneDrive-Personal/Documents/Analytics_Projects/POCs_Python/Llm_w_rag_poc/llm_w_rag_poc/data/raw/An-Introduction-to-Philosophy.pdf', 'page': 0}
11 :  page_content='4 \n If the flourishing of philosophy over the pas t century or so is to continue , philosophy as a living \ndiscipline will have to gain a broader following among the general educated public. The front \nline for this campaign is the P

## Save the embedding model

In [5]:
from sentence_transformers import SentenceTransformer

path_model_st = path_models + 'sentence-transformers'

# Fetch the sentence-embedding model once and persist it under path_model_st
# so that later cells can load it from disk.
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
# Alternative embedding model kept for reference:
# model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")
model.save(path_model_st)

# Drop the in-memory copy and hand cached accelerator memory back.
# torch.cuda.empty_cache() only concerns the CUDA allocator, so on a machine
# where the accelerator is Apple MPS the MPS cache has to be released instead.
del model
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available() and hasattr(torch, "mps"):
    torch.mps.empty_cache()

## Load the model from local directory

In [6]:
# Pick the compute backend for the models: CUDA if present, otherwise
# Apple MPS, otherwise plain CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using mps device


In [7]:
def load_embedding_model(model_path):
    """Create a ``HuggingFaceEmbeddings`` instance from a locally stored model.

    Args:
        model_path: Folder in which the embedding model was saved; passed as
            ``model_name``.

    Returns:
        The ``HuggingFaceEmbeddings`` object, configured to run on the
        module-level ``device`` with ``normalize_embeddings`` set to False.
    """
    return embeddings.HuggingFaceEmbeddings(
        model_name=model_path,
        model_kwargs={'device': device},
        encode_kwargs={'normalize_embeddings': False},
    )

# Build the embedding model from the copy saved under path_model_st.
embedding_model_instance = load_embedding_model(model_path=path_model_st)

In [8]:
# Report the torch build and which accelerators it can see.
for label, value in (
    ("Torch version:", torch.__version__),
    ("Is CUDA enabled?", torch.cuda.is_available()),
    ("Is MPS enabled?", torch.backends.mps.is_available()),
):
    print(label, value)

Torch version: 2.2.1
Is CUDA enabled? False
Is MPS enabled? True


## Creating Vector DB & storing the chunk embeddings

In [9]:
# Location under the data folder where the vector store is persisted
# (this is the path later given to db.save_local).
path_data_db = f"{path_data}interim/db.index"

def create_db(document_splitted, embedding_model_instance):
    """Build a FAISS vector store from the chunked documents.

    Args:
        document_splitted: Iterable of chunk objects exposing
            ``page_content`` and ``metadata``.
        embedding_model_instance: Embedding model handed to
            ``FAISS.from_texts``.

    Returns:
        The vector store returned by ``vs.FAISS.from_texts``.

    Raises:
        ValueError: If ``document_splitted`` contains no chunks.
        RuntimeError: If the index cannot be built; the underlying exception
            is chained as ``__cause__``.
    """
    documents = list(document_splitted)
    if not documents:
        raise ValueError("create_db needs at least one document chunk")

    content = [d.page_content for d in documents]
    # Each chunk's metadata dict is stored under a 'source' key; downstream
    # output relies on this nesting, so it is kept as-is.
    metadata = [{'source': d.metadata} for d in documents]

    try:
        return vs.FAISS.from_texts(content, embedding_model_instance, metadatas=metadata)
    except Exception as error:
        # Fail here with the real cause instead of returning None, which
        # would only surface later as an AttributeError on the caller's side.
        raise RuntimeError("Failed to build the FAISS vector store") from error

# Embed all chunks into a vector store, then persist it for future use.
db = create_db(
    document_splitted=document_splitted,
    embedding_model_instance=embedding_model_instance,
)
db.save_local(path_data_db)

## Load the large language model locally

In [10]:
path_model_llm = path_models + 'zephyr-7b-beta-model'
path_model_tokenizer = path_models + 'zephyr-7b-beta-tokenizer'

# Fetch tokenizer and weights (loaded in float16), then write both to the
# local model folders so later cells can load them from disk.
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceH4/zephyr-7b-beta", low_cpu_mem_usage=True, torch_dtype=torch.float16)
model.save_pretrained(path_model_llm, max_shard_size="1000MB")
tokenizer.save_pretrained(path_model_tokenizer)

# Drop the in-memory copies and hand cached accelerator memory back.
# torch.cuda.empty_cache() only concerns the CUDA allocator, so on a machine
# where the accelerator is Apple MPS the MPS cache has to be released instead.
del model
del tokenizer
if torch.cuda.is_available():
    torch.cuda.empty_cache()
elif torch.backends.mps.is_available() and hasattr(torch, "mps"):
    torch.mps.empty_cache()

tokenizer_config.json:   0%|          | 0.00/1.43k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/168 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/638 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.89G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/1.95G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/816M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

('/Users/pinaki/Library/CloudStorage/OneDrive-Personal/Documents/Analytics_Projects/POCs_Python/Llm_w_rag_poc/llm_w_rag_poc/models/zephyr-7b-beta-tokenizer/tokenizer_config.json',
 '/Users/pinaki/Library/CloudStorage/OneDrive-Personal/Documents/Analytics_Projects/POCs_Python/Llm_w_rag_poc/llm_w_rag_poc/models/zephyr-7b-beta-tokenizer/special_tokens_map.json',
 '/Users/pinaki/Library/CloudStorage/OneDrive-Personal/Documents/Analytics_Projects/POCs_Python/Llm_w_rag_poc/llm_w_rag_poc/models/zephyr-7b-beta-tokenizer/tokenizer.json')

## Load the LLM from the local directory

In [11]:
# Re-create tokenizer and model from the local folders they were saved to.
tokenizer = AutoTokenizer.from_pretrained(path_model_tokenizer)
model = AutoModelForCausalLM.from_pretrained(
    path_model_llm,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)

# Text-generation pipeline on the selected device, allowing up to 1000 new
# tokens per call.
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    max_new_tokens=1000,
)

# NOTE(review): `pipe` is already fully built at this point, so it is unclear
# whether model_kwargs={'temperature': 0} has any effect on generation --
# confirm against the HuggingFacePipeline documentation.
llm = HuggingFacePipeline(
    pipeline=pipe,
    model_kwargs={'temperature': 0},
)

Loading checkpoint shards:   0%|          | 0/15 [00:00<?, ?it/s]

## Retrieving pieces of context

In [12]:
query = "Give a summary of what philosophy is in 100 words."

# Retriever over the vector store using the "similarity_score_threshold"
# search type, configured with k=6 and score_threshold=0.01.
retriever = db.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"k": 6, 'score_threshold': 0.01},
)

# Fetch the chunks for the sample query directly from the retriever.
retrieved_docs = retriever.get_relevant_documents(query)

## Defining a prompt template

In [13]:
# Prompt layout: instructions, then the retrieved context, then the user's
# question, ending with the cue after which the model writes its answer.
template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "{context}\n"
    "Question: {question}\n"
    "Helpful Answer:"
)
rag_prompt_custom = PromptTemplate.from_template(template)

## Create chains to perform RAG

In [14]:
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    pieces = []
    for doc in docs:
        pieces.append(doc.page_content)
    return "\n\n".join(pieces)

# First chain: given {"documents": ..., "question": ...}, fill the prompt
# (context = joined document texts), run the LLM and parse the result to a string.
_prompt_inputs = {
    "context": lambda payload: format_docs(payload["documents"]),
    "question": itemgetter("question"),
}
rag_chain_from_docs = _prompt_inputs | rag_prompt_custom | llm | StrOutputParser()

# Second chain: run the retriever on the incoming query while passing the
# query through unchanged, then return both the metadata of the retrieved
# documents and the answer produced by rag_chain_from_docs.
_retrieve_step = RunnableParallel(
    {"documents": retriever, "question": RunnablePassthrough()}
)
_respond_step = {
    "documents": lambda payload: [doc.metadata for doc in payload["documents"]],
    "answer": rag_chain_from_docs,
}
rag_chain_with_source = _retrieve_step | _respond_step

In [15]:
# Run the full RAG chain on the sample query and report answer, sources and
# wall-clock time.
t0 = time.time()
resp = rag_chain_with_source.invoke(query)
if resp['documents']:
    # Replace any run of trailing newlines in the answer with a single space.
    stripped_resp = re.sub(r"\n+$", " ", resp['answer'])
    print(stripped_resp)
    print('Sources', resp['documents'])
    print('Response time:', time.time() - t0)
else:
    print('No documents found')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Philosophy is the pursuit of rational inquiry into fundamental questions about the nature of reality, knowledge, values, and existence. It involves critical thinking, logical reasoning, and systematic analysis of ideas and arguments. Philosophy is distinct from science, which focuses on empirical observation and experimentation, and from religion, which is based on faith and revelation. While some philosophers argue that certain areas of philosophy, such as metaphysics and ethics, are meaningless or unverifiable, others maintain that they are essential components of a well-rounded education and a healthy society. Ultimately, the nature and value of philosophy are subject to ongoing debate and interpretation.
Sources [{'source': {'source': '/Users/pinaki/Library/CloudStorage/OneDrive-Personal/Documents/Analytics_Projects/POCs_Python/Llm_w_rag_poc/llm_w_rag_poc/data/raw/An-Introduction-to-Philosophy.pdf', 'page': 9}}, {'source': {'source': '/Users/pinaki/Library/CloudStorage/OneDrive-Pe

In [18]:
# Print the answer exactly as returned by the chain (without the
# trailing-newline clean-up applied in the previous cell).
print(resp['answer'])

 Philosophy is the pursuit of rational inquiry into fundamental questions about the nature of reality, knowledge, values, and existence. It involves critical thinking, logical reasoning, and systematic analysis of ideas and arguments. Philosophy is distinct from science, which focuses on empirical observation and experimentation, and from religion, which is based on faith and revelation. While some philosophers argue that certain areas of philosophy, such as metaphysics and ethics, are meaningless or unverifiable, others maintain that they are essential components of a well-rounded education and a healthy society. Ultimately, the nature and value of philosophy are subject to ongoing debate and interpretation.
