<a href="https://colab.research.google.com/github/yojuna/local_llm_RAG/blob/main/schrodinger_what_is_life_mistral_RAG.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Notebook running a local Mistral 7b Instruct Model, chained with Retrieval Augmented Generation (RAG), for conversing with the legendary collection of essays, in Erwin Schrödinger's What Is Life?

In [3]:
# colab autoreload

%load_ext autoreload
%autoreload 2

In [25]:
## Installation/setup

# Reading in PDF Files
!pip install -q -U pypdf
# Setting Up Vector Store
!pip install -q -U chromadb
# Using Llama-7b-GPTQ LLM model in HuggingFace
!pip install q -U torch auto-gptq transformers optimum
# LangChain - Loading PDFs, Text Chunking, BGE Embeddings, Retrieval QA Chain
!pip install -q -U langchain sentence_transformers

!pip install -q -U torch datasets transformers tensorflow langchain playwright html2text sentence_transformers faiss-cpu
!pip install -q accelerate==0.21.0 peft==0.4.0 bitsandbytes==0.40.2 trl==0.4.7

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m284.0/284.0 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m509.0/509.0 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m22.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.7/60.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.7/40.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.4/5.4 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.4/6.4 MB[0m [31m52.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
auto-gptq 0.6.0 requires accelerate>=0.22.0, but you have accelerate 0.21.0 which is incompatible.
auto-gptq 0.6.0 requires peft>=0.5.0, but you have peft 0.4.0 which is incompatible.[0m[31m
[0m

In [2]:

# Import torch
import torch

# Import for loading PDFs from Google Drive.
# Note: Not needed if GDrive is already mounted or we are using wget to get files from Web.
# from google.colab import drive

# Imports to read PDF and setup Chroma Vector Store
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Imports for LLM
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain import PromptTemplate  #, LLMChain

# Imports for QA Retrieval Chain
from langchain.chains import RetrievalQA

# Import to Clenup LLM Output
import textwrap


In [5]:
import os
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from peft import LoraConfig, PeftModel

from langchain.text_splitter import CharacterTextSplitter
from langchain.document_transformers import Html2TextTransformer
from langchain.document_loaders import AsyncChromiumLoader

from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Imports to read PDF and setup Chroma Vector Store
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.llms import HuggingFacePipeline

## Setup the LLM

In [6]:
#################################################################
# Tokenizer
#################################################################

model_name='mistralai/Mistral-7B-Instruct-v0.2'

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

#################################################################
# bitsandbytes parameters
#################################################################

# Activate 4-bit precision base model loading
use_4bit = True

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False

#################################################################
# Set up quantization config
#################################################################
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

#################################################################
# Load pre-trained config
#################################################################
mistral_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [7]:
# check/get the number of trainable parameters

def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

print(print_number_of_trainable_model_parameters(mistral_model))



trainable model parameters: 262410240
all model parameters: 3752071168
percentage of trainable model parameters: 6.99%


In [8]:
# setup the LLM pipeline

standalone_query_generation_pipeline = pipeline(
 model=mistral_model,
 tokenizer=tokenizer,
 task="text-generation",
 temperature=0.0,
 repetition_penalty=1.1,
 return_full_text=True,
 max_new_tokens=1000,
)
standalone_query_generation_llm = HuggingFacePipeline(pipeline=standalone_query_generation_pipeline)

response_generation_pipeline = pipeline(
 model=mistral_model,
 tokenizer=tokenizer,
 task="text-generation",
 temperature=0.2,
 repetition_penalty=1.1,
 return_full_text=True,
 max_new_tokens=1000,
)
response_generation_llm = HuggingFacePipeline(pipeline=response_generation_pipeline)

## Get the documents/ data

In [21]:
# Get the Document / Textbook

## Need to run only once

# ## Feynman lectures on physics

# ! mkdir -p docs
# ! wget https://antilogicalism.com/wp-content/uploads/2018/04/feynman-lectures.pdf -O docs/feynman-lectures.pdf

! wget https://archive.org/download/feynman-lectures-on-physics-volumes-1-2-3-feynman-leighton-and-sands/Feynman%20Lectures%20on%20Physics%20Volumes%201%2C2%2C3%20-%20Feynman%2C%20Leighton%20and%20Sands.pdf -O docs/archive-feynman-lectures.pdf

! wget http://strangebeautiful.com/other-texts/schrodinger-what-is-life-mind-matter-auto-sketches.pdf -O docs/what-is-life.pdf

--2024-01-29 18:24:03--  https://archive.org/download/feynman-lectures-on-physics-volumes-1-2-3-feynman-leighton-and-sands/Feynman%20Lectures%20on%20Physics%20Volumes%201%2C2%2C3%20-%20Feynman%2C%20Leighton%20and%20Sands.pdf
Resolving archive.org (archive.org)... 207.241.224.2
Connecting to archive.org (archive.org)|207.241.224.2|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://ia902508.us.archive.org/29/items/feynman-lectures-on-physics-volumes-1-2-3-feynman-leighton-and-sands/Feynman%20Lectures%20on%20Physics%20Volumes%201%2C2%2C3%20-%20Feynman%2C%20Leighton%20and%20Sands.pdf [following]
--2024-01-29 18:24:03--  https://ia902508.us.archive.org/29/items/feynman-lectures-on-physics-volumes-1-2-3-feynman-leighton-and-sands/Feynman%20Lectures%20on%20Physics%20Volumes%201%2C2%2C3%20-%20Feynman%2C%20Leighton%20and%20Sands.pdf
Resolving ia902508.us.archive.org (ia902508.us.archive.org)... 207.241.228.218
Connecting to ia902508.us.archive.org (ia902508.u

In [22]:
# load document from directory

loader = DirectoryLoader('docs/', glob="./*.pdf", loader_cls=PyPDFLoader)

documents = loader.load()

In [23]:
# number of pages in the pdf
len(documents)

1572

In [24]:
# LLM Token Chunksize varies based on Context Window. LLaMA2 Context Window is 4096 tokens.
# For QA want to pick larger chunk size with some overlap to get context.
CHUNK_SIZE, CHUNK_OVERLAP = 1000, 200


text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE,
                                               chunk_overlap=CHUNK_OVERLAP)
texts = text_splitter.split_documents(documents)

len(texts)

764

In [25]:
texts[700]

Document(page_content='MindandMatter\n(notably intheZeeman andStarkeffects)someofthespectral\nlinesarepolarized. Tocomplete thephysical description in\nthisrespect, inwhichthehumaneyeisentirely insensitive,\nyouputapolarizer (aNicolprism)inthepathofthebeam,\nbeforedecomposing it;onslowlyrotating theNicolaroundits\naxiscertain linesareextinguished orreduced tominimal\nbrightness forcertainorientations oftheNicol,whichindicate\nthedirection (orthogonal tothebeam)oftheirtotalorpartial\npolariza tion.\nOncethiswholetechnique isdeveloped, itcanbeextended\nfarbeyond thevisibleregion.Thespectral linesofglowing\nvapours arebynomeansrestricted tothevisibleregion,which\nisnotdistinguished physically. Thelinesformlong,theoret\xad\nicallyinfinite series.Thewave-lengths ofeachseriesare\nconnected byarelatively simplemathematical law,peculiar\ntoit,thatholdsuniformly throughout theserieswithno\ndistinctionofthatpartoftheseriesthathappens tolieinthe\nvisibleregion.Theseseriallawswerefirstfoundempiric

### Alternative Data sources

#### Alternative: Extract data from News APIs

In [None]:
## Optional: If extracting News data using Google News API
# google news api

!pip install GoogleNews

!pip install newspaper3k

In [None]:
from GoogleNews import GoogleNews
from newspaper import Article
import pandas as pd

In [None]:
googlenews=GoogleNews(start='05/01/2024',end='28/01/2024')
googlenews.search('Finance')
result=googlenews.result()
df=pd.DataFrame(result)
print(df.head())

In [None]:
df

In [None]:
# get more articles by looping through

NUM_SEARCH_PAGES = 10

for i in range(2, NUM_SEARCH_PAGES):
    googlenews.getpage(i)
    result=googlenews.result()
    df=pd.DataFrame(result)

#### Alternative: Extract web pages/ blog articles

In [None]:
## uncomment if smooth browser functionality is required for using chrome/firefox web drivers

# !playwright install
# !playwright install-deps

In [None]:
# Alternative
# Inference over URL, using chromium driver



import nest_asyncio
nest_asyncio.apply()

# Articles to index
## Andrej Karpathy: Software 2.0 Article
articles = ["https://karpathy.medium.com/software-2-0-a64152b37c35",]

# Scrapes the blogs above
loader = AsyncChromiumLoader(articles)
docs = loader.load()

## Create embeddings and vector db

Create Retriever Embeddings - HF BGE Embeddings

BGE Embeddings are at the top of the leader board on Hugging Face (https://huggingface.co/spaces/mteb/leaderboard).


In [26]:
# BGE Embedding Model for Retrieval. Embedding Size is 768.
model_name = "BAAI/bge-base-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity

model_embedding = HuggingFaceBgeEmbeddings(
                    model_name=model_name,
                    model_kwargs={'device': 'cuda'},
                    encode_kwargs=encode_kwargs
                  )

Create the Vector DB Store Using Chroma DB

In [27]:
%%time
# Embed and store the texts
# Supplying a persist_directory will store the embeddings on disk
# Creating Vector Store takes ~ 2 mins

persist_directory = 'db'

## Here is the nmew embeddings being used
embedding = model_embedding

vectordb = Chroma.from_documents(documents=texts,
                                 embedding=embedding,
                                 persist_directory=persist_directory)

CPU times: user 14.6 s, sys: 20.1 ms, total: 14.6 s
Wall time: 14.3 s


In [14]:
# Returns the Top-k chunks from vectordb. Set to 2 to check.
retriever = vectordb.as_retriever(search_kwargs={"k": 2})

### Check retrieval from Chroma db

In [None]:
# # Approach 1: Use Query to do Similarity Search

query = "What is The Physical Basis of Consciousness?"


docs = vectordb.similarity_search(query)

print("# of results: ", len(docs))

# First page
print(docs[0].page_content)
print("\n")
print(docs[0].metadata)

# Last page of results
print(docs[-1].page_content)
print("\n")
print(docs[-1].metadata)

In [None]:
# # Approach 2: Use Embedding Vector to do Similarity Search
embedding_vector = embedding.embed_query(query)

docs = vectordb.similarity_search_by_vector(embedding_vector)

print("# of results: ", len(docs))

# First page
print(docs[0].page_content)
print("\n")
print(docs[0].metadata)

# Last page of results
print(docs[-1].page_content)
print("\n")
print(docs[-1].metadata)

## Run LLM on data

In [32]:
prompt = "What is Consciousness?"

prompt_template=f'''[INST] <>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<>
{prompt}[/INST]

'''

print("\n\n*** Generate:")

print(response_generation_pipeline(prompt_template)[0]['generated_text'])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.




*** Generate:
[INST] <>
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe.  Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<>
What is Consciousness?[/INST]

Consciousness refers to an individual's subjective experience of the world around them. It involves the ability to perceive, process, and respond to information from the environment, as well as having self-awareness and introspective abilities. The exact nature of consciousness and how it arises from physical processes in the brain is still a topic of ongoing research and debate among scientists and philosophers. Some theories s

## Setup RAG Chain

RAG Chain = LLM + Retriever + Query Prompt

In [33]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [35]:
qa_chain = RetrievalQA.from_chain_type(llm=response_generation_llm,
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

### Check RAG responses

In [38]:
query = "What are the author's thoughts on Determinism and Free Will?"

llm_response = qa_chain(query)
llm_response['result'].split('\n')

  warn_deprecated(
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


[" According to Schrödinger, based on the evidence presented, the space-time events in a living being which correspond to its mind's activity or any other actions are statistically deterministic. However, he does not believe that quantum indeterminacy plays a significant role biologically, except perhaps in enhancing the purely accidental character of certain events like meiosis and natural mutation. He emphasizes that this view is not in conflict with physics, as even clockwork is ultimately statistical in nature. Schrödinger also acknowledges that many scientists who have made fundamental contributions in biology have been influenced by his ideas, despite some disagreement. He laments that these insights are still ignored by a disconcertingly large proportion of people who should know better. He concludes by expressing the importance of understanding the relationship between determinism and free will, which he believes is a hard task that requires further consideration."]

In [39]:
query = "Is Life Based on the Laws of Physics? Give your own thoughts about this in the end."

llm_response = qa_chain(query)
llm_response['result'].split('\n')

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


['',
 'Erwin Schrödinger, in his book "What is Life?" discusses the idea that the laws of physics can explain the behavior of living organisms. He argues that while the laws of physics are important, they alone do not fully explain the complex behaviors exhibited by living systems. He suggests that there may be new types of physical laws that govern these unique properties.',
 '',
 'Schrödinger uses the example of a clock to illustrate his point. While a clock follows the laws of physics, it is not purely mechanical because it requires winding and springs to keep running. Similarly, living organisms exhibit behaviors that cannot be explained solely by the known laws of physics.',
 '',
 'He also mentions the concept of entropy, which is a measure of disorder or randomness in a system. The second law of thermodynamics states that the total entropy of a closed system always increases over time. However, living organisms maintain a certain level of order, defying this expectation. Schrödin

## RAG with better prompting

In [40]:
## Default LLaMA-2 prompt style // taken from example that used Llama2 and not Mistral-7b

B_INST, E_INST = "[INST]", "[/INST]"
B_SYS, E_SYS = "<>\n", "\n<>\n\n"
DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""

def get_prompt(instruction, new_system_prompt=DEFAULT_SYSTEM_PROMPT ):
    SYSTEM_PROMPT = B_SYS + new_system_prompt + E_SYS
    prompt_template =  B_INST + SYSTEM_PROMPT + instruction + E_INST
    return prompt_template

In [41]:
sys_prompt = """You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. """

instruction = """CONTEXT:/n/n {context}/n

Question: {question}"""

get_prompt(instruction, sys_prompt)

"[INST]<>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.\n\nIf a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information. \n<>\n\nCONTEXT:/n/n {context}/n\n\nQuestion: {question}[/INST]"

In [42]:
prompt_template = get_prompt(instruction, sys_prompt)

mistral_prompt = PromptTemplate(
    template=prompt_template, input_variables=["context", "question"]
)

In [43]:
chain_type_kwargs = {"prompt": mistral_prompt}

In [44]:
retriever = vectordb.as_retriever(search_kwargs={"k": 5})

In [45]:
# create the chain to answer questions
qa_chain = RetrievalQA.from_chain_type(llm=response_generation_llm,
                                       chain_type="stuff",
                                       retriever=retriever,
                                       chain_type_kwargs=chain_type_kwargs,
                                       return_source_documents=True)

In [46]:
## Cite sources
def wrap_text_preserve_newlines(text, width=110):
    # Split the input text into lines based on newline characters
    lines = text.split('\n')

    # Wrap each line individually
    wrapped_lines = [textwrap.fill(line, width=width) for line in lines]

    # Join the wrapped lines back together using newline characters
    wrapped_text = '\n'.join(wrapped_lines)

    return wrapped_text

def process_llm_response(llm_response):
    print(wrap_text_preserve_newlines(llm_response['result']))
    print('\n\nSources:')
    for source in llm_response["source_documents"]:
        print(source.metadata['source'])




#### Use Prompted RAG to Answer Some contextual Questions

RAG with better prompting gives us good responses. It figures out we are talking about specifically.

It also provides the source chunks we used to provide the answer which makes it easier to verify the response.


In [47]:
# Example 1
query = "What is Entropy? Explain in detail what Schrodinger is talking about in this context."

llm_response = qa_chain(query)
process_llm_response(llm_response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 Entropy is a physical property measured in calories per degree Celsius (cal/oC) that quantifies the disorder
or randomness of a system. In the context of Erwin Schrödinger's discussion, he explains that entropy is
related to the statistical concept of order and disorder, revealed through investigations in statistical
physics by Boltzmann and Gibbs. The relationship is expressed mathematically as entropy = k log D, where k is
the Boltzmann constant and Da is a quantitative measure of atomistic disorder of the body in question.
Schrodinger also mentions that the unit in which entropy is measured is the calorie, and he justifies this
definition to remove entropy from the atmosphere of mystery that often surrounds it. He further discusses
Nernst's theorem, which states that a physical system displays "dynamical law" or clock-work features only at
absolute zero temperature, and quantum theory provides the rational foundation for this fact. Schrodinger also
emphasizes the importance of the 

In [48]:
# Example 2
## relevant question for the Feynman Lectures textbook

query = "What is Conservation of Energy?"

llm_response = qa_chain(query)
process_llm_response(llm_response)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


 The Conservation of Energy is a fundamental law of physics stating that the total energy of a closed system
remains constant, regardless of the changes that occur within the system. It applies to all natural phenomena
and has no known exceptions. Energy can exist in various forms such as gravitational, kinetic, heat, elastic,
electrical, chemical, radiant, and nuclear energy, and their respective formulas can be added up to obtain the
total energy of the system, which remains constant unless energy is added or removed from the system.


Sources:
docs/feynman-lectures.pdf
docs/archive-feynman-lectures.pdf
docs/feynman-lectures.pdf
docs/archive-feynman-lectures.pdf
docs/feynman-lectures.pdf


## Alternative approach

Create PromptTemplate and LLMChain

ref: [github: madhavthaker1 / llm/rag/conversational_rag.ipynb](https://github.com/madhavthaker1/llm/blob/main/rag/conversational_rag.ipynb)

In [49]:

# imports

from langchain.schema import format_document
from langchain_core.messages import get_buffer_string
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain.memory import ConversationBufferMemory
from langchain.prompts.prompt import PromptTemplate
from langchain_core.prompts.chat import ChatPromptTemplate

from operator import itemgetter
from pprint import pprint

In [51]:
_template = """
[INST]
Given the following conversation and a follow up question, rephrase the follow up question to be a standalone question, in its original language, that can be used to query a Chroma DB vector index. This query will be used to retrieve documents with additional context.

Let me share a couple examples that will be important.

If you do not see any chat history, you MUST return the "Follow Up Input" as is:

```
Chat History:

Follow Up Input: What is Entropy?
Standalone Question:
What is Entropy?
```

If this is the second question onwards, you should properly rephrase the question like this:

```
Chat History:
Human: What is Entropy?
AI:
Entropy is a physical property measured in calories per degree Celsius (cal/oC) that quantifies the disorder or randomness of a system.

Follow Up Input: How is it measured?
Standalone Question:
How is Entropy measured?
```

Now, with those examples, here is the actual chat history and input question.

Chat History:
{chat_history}

Follow Up Input: {question}
Standalone question:
[your response here]
[/INST]
"""

CONDENSE_QUESTION_PROMPT = PromptTemplate.from_template(_template)

In [52]:
template = """
[INST]
Answer the question based only on the following context:
{context}

Question: {question}
[/INST]
"""

ANSWER_PROMPT = ChatPromptTemplate.from_template(template)

In [53]:
DEFAULT_DOCUMENT_PROMPT = PromptTemplate.from_template(template="{page_content}")


def _combine_documents(
    docs, document_prompt=DEFAULT_DOCUMENT_PROMPT, document_separator="\n\n"
):
    doc_strings = [format_document(doc, document_prompt) for doc in docs]
    return document_separator.join(doc_strings)

In [54]:
# Instantiate ConversationBufferMemory
memory = ConversationBufferMemory(
 return_messages=True, output_key="answer", input_key="question"
)

# First we add a step to load memory
# This adds a "memory" key to the input object
loaded_memory = RunnablePassthrough.assign(
    chat_history=RunnableLambda(memory.load_memory_variables) | itemgetter("history"),
)
# Now we calculate the standalone question
standalone_question = {
    "standalone_question": {
        "question": lambda x: x["question"],
        "chat_history": lambda x: get_buffer_string(x["chat_history"]),
    }
    | CONDENSE_QUESTION_PROMPT
    | standalone_query_generation_llm,
}
# Now we retrieve the documents
retrieved_documents = {
    "docs": itemgetter("standalone_question") | retriever,
    "question": lambda x: x["standalone_question"],
}
# Now we construct the inputs for the final prompt
final_inputs = {
    "context": lambda x: _combine_documents(x["docs"]),
    "question": itemgetter("question"),
}
# And finally, we do the part that returns the answers
answer = {
    "answer": final_inputs | ANSWER_PROMPT | response_generation_llm,
    "question": itemgetter("question"),
    "context": final_inputs["context"]
}
# And now we put it all together!
final_chain = loaded_memory | standalone_question | retrieved_documents | answer

In [55]:
def call_conversational_rag(question, chain, memory):
    """
    Calls a conversational RAG (Retrieval-Augmented Generation) model to generate an answer to a given question.

    This function sends a question to the RAG model, retrieves the answer, and stores the question-answer pair in memory
    for context in future interactions.

    Parameters:
    question (str): The question to be answered by the RAG model.
    chain (LangChain object): An instance of LangChain which encapsulates the RAG model and its functionality.
    memory (Memory object): An object used for storing the context of the conversation.

    Returns:
    dict: A dictionary containing the generated answer from the RAG model.
    """

    # Prepare the input for the RAG model
    inputs = {"question": question}

    # Invoke the RAG model to get an answer
    result = chain.invoke(inputs)

    # Save the current question and its answer to memory for future context
    memory.save_context(inputs, {"answer": result["answer"]})

    # Return the result
    return result

### Ask yer' questions

In [57]:
# Initial Question

question = "What is the Hereditary Mechanism?"

result = call_conversational_rag(question, final_chain, memory)

print(result)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'answer': 'The process by which genetic information is transmitted from parents to their offspring involves the separation of chromosomes during meiosis in the mutant organism. This results in the creation of gametes, each carrying one copy of the altered chromosome. When these gametes combine during fertilization, the resulting zygote contains one altered chromosome from each parent. This leads to the expression of the mutation in the offspring. Mutations are inherited as perfectly as the original, unchanged characteristics were, making them a change in the hereditary treasure that must be accounted for by some change in the hereditary substance. Most important breeding experiments have revealed the mechanism of heredity through careful analysis of the offspring obtained by crossing mutated individuals with non-mutated or differently mutated ones.', 'question': 'What is the process by which genetic information is transmitted from parents to their offspring?', 'context': "separation o

In [60]:
# pretty print for easier reading in the notebook
pprint(result)

{'answer': 'The process by which genetic information is transmitted from '
           'parents to their offspring involves the separation of chromosomes '
           'during meiosis in the mutant organism. This results in the '
           'creation of gametes, each carrying one copy of the altered '
           'chromosome. When these gametes combine during fertilization, the '
           'resulting zygote contains one altered chromosome from each parent. '
           'This leads to the expression of the mutation in the offspring. '
           'Mutations are inherited as perfectly as the original, unchanged '
           'characteristics were, making them a change in the hereditary '
           'treasure that must be accounted for by some change in the '
           'hereditary substance. Most important breeding experiments have '
           'revealed the mechanism of heredity through careful analysis of the '
           'offspring obtained by crossing mutated individuals with '
         

In [62]:
# follow up question with generic mention of parents

question = "What is the siginificance of the chromosomes of the parents?"

pprint(call_conversational_rag(question, final_chain, memory))

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


{'answer': 'Chromosomes from each parent play a crucial role in the creation '
           'of genetically diverse offspring through meiosis. During meiosis, '
           'homologous chromosomes pair up and exchange genetic material '
           'through a process called crossing-over. This results in the '
           'shuffling of genes between the chromosomes from each parent, '
           'leading to new combinations of alleles in the offspring. The '
           'separation of these paired chromosomes during meiosis ensures that '
           'each gamete receives one chromosome from each pair, ensuring '
           'genetic diversity in the resulting offspring.',
 'context': 'separation ofthetwochromosomes onmeiosisinthemutant\xad\n'
            "asshown,veryschematically, inFig.g.Thisisa'pedigree',\n"
            'representing everyindividual (ofthreeconsecutive genera\xad\n'
            'tions)simplybythepairofchromosomes inquestion. Please\n'
            'realizethatifthemutanthad

## Credits

references:

### Code:


https://github.com/jai-llm/RAG_Docs_LLaMA2/blob/main/RAG_HastieBooks_chromaDB_V3.ipynb

https://github.com/madhavthaker1/llm/blob/main/rag/conversational_rag.ipynb ; https://medium.com/@thakermadhav/part-2-build-a-conversational-rag-with-langchain-and-mistral-7b-6a4ebe497185

https://blog.llamaindex.ai/introducing-rags-your-personalized-chatgpt-experience-over-your-data-2b9d140769b1

### Data:

http://strangebeautiful.com/other-texts/schrodinger-what-is-life-mind-matter-auto-sketches.pdf

https://archive.org/details/feynman-lectures-on-physics-volumes-1-2-3-feynman-leighton-and-sands

https://karpathy.medium.com/software-2-0-a64152b37c35
