In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install langchain chromadb pypdf sentence-transformers bitsandbytes accelerate

# Importing Libraries and Functions:

In [4]:
import chromadb
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from chromadb.utils import embedding_functions
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.prompts import PromptTemplate
from pypdf import PdfReader
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains import RetrievalQA
from langchain.vectorstores import Chroma
from langchain_community.llms import LlamaCpp
from langchain.document_loaders import TextLoader
from langchain.llms import CTransformers
from torch import cuda, bfloat16
from langchain.text_splitter import RecursiveCharacterTextSplitter, SentenceTransformersTokenTextSplitter
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

In [5]:
from huggingface_hub import notebook_login

notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# llama2 Model and Tokenizer:

In [6]:
model_id= "meta-llama/Llama-2-7b-chat-hf"

model_config = transformers.AutoConfig.from_pretrained(
    model_id,
)

# set quantization configuration to load large model with less GPU memory
# this requires the `bitsandbytes` library
bnb_config = transformers.BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=bfloat16
)


model = transformers.AutoModelForCausalLM.from_pretrained(
model_id,
trust_remote_code=True,
config=model_config,
quantization_config=bnb_config,
device_map='auto',
)



tokenizer = AutoTokenizer.from_pretrained(model_id)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

# Function for Loading the Doucument:

We are removing whitespaces and other stuff becuase, we will not be able to ingest none type objects in vector database so we have to process the pdf before ingesting it in the vector database.

This function will return us the text from our Document after removing extra white spaces and empty strings/pages to prevent errors.:

In [7]:
def load_doc(path):
    reader = PdfReader(path)
    #We will iterate over each page 'p', extract the text and removing the whitespaces before and after sentences.
    pdf_texts = [p.extract_text().strip() for p in reader.pages]

    # Filter the empty strings-->pdf_texts will have only those pages that have the text
    pdf_texts = [text for text in pdf_texts if text]

    return pdf_texts

# Function for Dividing the Documents in Chunks:
We do chunking to ensure that our document is easily processed by LLM and chunks will be easily commited in the Vector Database.

This Function will return us the list of Chunks that are result of our Document Splitting.

Character Splitter is not enogh becuase the embedding model that we are using called SentenceTransformer has limited context window width and exactly uses 256 characters. That is the maximum context window width which is basically maximum legth of text used to understand the that text.

In [8]:
def chunks(path_of_doc):

    #we will use the path of doc to pass it to load_doc
    pdf_texts= load_doc(path_of_doc)
    character_splitter = RecursiveCharacterTextSplitter(
    #RecursiveCharacterTextSplitter will split the Document into chunks firstly when it will find the double line
        #After that if the splitted chunks are greater than size of 1000 which is our chunk size they will get split on single line
            #Even if Chunks have larger size than 1000 they will get split on ". "
        separators=["\n\n", "\n", ". ", " ", ""],
        #every chunk will have 2000 characters
        chunk_size=2000,
        chunk_overlap=20
    )
    #further splitting according to embedding model we are using
    character_split_texts = character_splitter.split_text('\n\n'.join(pdf_texts))
    #so lets split it according to our embedding model
                                                                            #we want every token to have 256 characters
    token_splitter = SentenceTransformersTokenTextSplitter(chunk_overlap=0, tokens_per_chunk=256)
    token_split_texts = []
    for text in character_split_texts:
    #splitting the text and storing in list
        token_split_texts += token_splitter.split_text(text)

    return token_split_texts

# Function for Embeddings:

We will convert our textual data/chunks to numbers or vectors to store them in vector database.
This Function will return us a method to Embedd our textual chunks to numbers.

In [9]:
def embedding_function():
    embedding_function = SentenceTransformerEmbeddingFunction()
    return embedding_function

# Function for Doc/Chunks Ingestion in the Vector Database:
This function will return us the collection of Vector database that will have our chunks vector stored according to semantic meaning and we will able to query this database/collection to get answer from our document that we ingested.

In [10]:
def doc_ingestion_in_vecdb(path_of_doc):
    #we will pass the path of doc to chunks that will further pass it to doc_load for loading it
    token_split_texts= chunks(path_of_doc)
    embedding_func= embedding_function()

    #making chromadb client object good for testing only not for production purpose
    chroma_client = chromadb.Client()

    #making the collection of chroma database
    chroma_collection = chroma_client.create_collection("Doc-Collection", embedding_function=embedding_func)

    #ids of each chunk
    ids = [str(i) for i in range(len(token_split_texts))]

    chroma_collection.add(ids=ids, documents=token_split_texts)
    return chroma_collection

# Function for Retrieving Result from Vector DB:
After ingestion we can query the database directly and it will return us most close results according to semantic meaning.

This Function will return us top 5 results that matches with our query.

In [48]:
def retrieve_doc(query, chroma_collection):
    #we will pass the document path to doc_ingestion_in_vecdb() so that can ingest it in vector db after chunking and loading
    # chroma_collection= doc_ingestion_in_vecdb(path_to_doc)

    #lets get the 5 relevant results
    results = chroma_collection.query(query_texts=[query], n_results=5)

    #[0] means give the result of the first query, right now we have only 1 query
    retrieved_documents = results['documents'][0]

    return retrieved_documents

# Rag Function:

In [14]:
def rag(query, retrieved_documents, model, tokenizer):

    #we will join the retrieved documents (that are the relavant documents) into one variable
    messages = f"""
          You are a helpful expert of Machine Learning and Deep Learning.
          Your users are asking questions about information contained in an Machine Learning and Deep Learning Book."
          You will be shown the user's question, and the relevant information from the book.
          Answer the user's question using only this relevant information. Try to build a good answer uing this information.

          Give Precise answer according to User's Instruction, like if user wants explaination then provide explaination.
          If User want an answer in one or two lines then give answer accordingly.

        User's Question: {query}. \n Information from the book of Machine Learning and Deep Learning: {retrieved_documents}"
    """
    pipe = pipeline("text-generation",
                        model=model,
                        tokenizer=tokenizer)

    llm = HuggingFacePipeline(pipeline=pipe)
    # checking again that our model is working fine--->Asking LLM model the same question we asked our document
    content= llm(prompt=messages)

    return content


# Advance RAG Techniques:
## 1. Query Expansion Using the Hypothetical Answer from LLM:
So Basically instead of directly using our query to get the documents from vector database, firstly we will pass our query to LLM to get a hypothetical answer (that will help us to get answers like that from our document) from the LLM and then we will combine that hypothetical answer with our query to get an accurate and closely related answers/documents from the vector database.



In [15]:
def hypothetical_answer(query, model, tokenizer):
    messages = f"""
        You are a helpful expert Machine Learning and Deep Learning assistant.
        Provide an example answer to the given question,
        that might be found in a documents like from Book related to machine Learning and Deep Learning.

        The Question about which you have to give example answer is: {query}
    """
    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer=tokenizer)

    llm = HuggingFacePipeline(pipeline=pipe)
    # checking again that our model is working fine--->Asking LLM model the same question we asked our document
    content= llm(prompt=query)

    #joining the original query and hypothetical answer
    joint_query = f"{query} {content}"

    return joint_query

In [16]:
original_query = "what is Linear Regression in one line?"
joint_query = hypothetical_answer(original_query, model, tokenizer)
print(joint_query)

  warn_deprecated(


what is Linear Regression in one line? 
 nobody knows.

But in reality, Linear Regression is a statistical model used to establish a linear relationship between a dependent variable (target variable) and one or more independent variables (feature variables). It is a popular and widely used machine learning algorithm that helps to predict the value of a target variable based on the values of one or more feature variables.

Here's a simple example to illustrate Linear Regression:

Suppose we have a dataset of cars and their prices, and we want to know which features of the car (e.g. engine size, number of doors, etc.) are most strongly associated with the price of the car. We can use Linear Regression to estimate the linear relationship between the car's features and its price.

The linear regression model can be represented in one line as follows:

Price = a + b*EngineSize + c*NumberOfDoors + d*OtherFeatures + e

Where a, b, c, d, and e are coefficients that represent the strength and d

**Now if you look above query, it has alot of hypothetical answer that can bring very accurate inforamtion about the query from the document, beacuase vector database will return answer similar to above query.**

### Making Vector DB Collection of our Doc:

In [None]:
path_to_doc= '/content/drive/MyDrive/GenAI/langchain_chorma_llama2_RAG/Aurelie-Geron-Hands-On-Machine-Learning.pdf'
chroma_collection= doc_ingestion_in_vecdb(path_to_doc)

# Getting Relative Docs and Passing them to RAG:
We will get the relative documents using the joint query.

In [27]:
retrieved_documents= retrieve_doc(joint_query, chroma_collection)
retrieved_text= ""
for doc in retrieved_documents:
  retrieved_text+=doc

In [29]:
print(rag(original_query, retrieved_text, model, tokenizer))

1. The linear regression model is a simple model that makes predictions by computing a weighted sum of input features, plus a constant called the bias term.
    2. The linear regression model is trained using a direct "closed-form" equation that computes the model parameters that best fit the training data.
    3. The linear regression model is a predictor, meaning it can make predictions given a new dataset of input features.








## 2. Query Expansion Using the Related Queries:
Now in this technique basically we generate the queries related with our original queries and then we can use these queries to get more appropriate related documents from our vector database.


In [35]:
def generate_multiple_queries(query, model, tokenizer):
    messages = f"""
        You are a helpful expert Machine Learning and Deep Learning assistant.
        Tour Users are asking question related to machine Learning and Deep Learning.
        Suggest up to five additional related questions to help them find the information they need, for the provided question.
        Suggest only short questions without compound sentences. Suggest a variety of questions that cover different aspects of the topic.
        Make sure they are complete questions, and that they are related to the original question.
        Output one question per line. Do not number the questions.

        The question about which you have to generate the question is {query}"""

    pipe = pipeline("text-generation",
                    model=model,
                    tokenizer=tokenizer)

    llm = HuggingFacePipeline(pipeline=pipe)
    content= llm(prompt=messages)
    content = content.split("\n")
    return content

In [36]:
original_query = "What is deep learning?"
augmented_queries = generate_multiple_queries(original_query, model, tokenizer)

for query in augmented_queries:
    print(query)



    Example Output:
        What are the different types of deep learning models?
        How do you preprocess data for deep learning?
        Can deep learning be used for image classification?
        What are the advantages of using deep learning for natural language processing?
        How do you choose the right deep learning architecture for a project?
    Possible Outputs:
        * What are the most common deep learning frameworks?
        * How do you optimize deep learning models for deployment?
        * Can deep learning be used for time series forecasting?
        * What are the challenges of deep learning for beginners?
        * How do you evaluate the performance of deep learning models?
    Note: 
        The output should be in the format of a list of questions.
        Each question should be on a new line.
        The questions should be short and to the point.
        The questions should be related to the original question.


#### Combining Original and Generated Queries and Retrieving the Result:

In [37]:
queries = [original_query] + augmented_queries
retrieved_documents= retrieve_doc(queries, chroma_collection)
retrieved_text= ""
for doc in retrieved_documents:
  retrieved_text+=doc

In [39]:
retrieved_text

'part ii, neural networks and deep learning, covers the following topics : • what are neural nets? what are they good for? • building and training neural nets using tensorflow and keras. • the most important neural net architectures : feedforward neural nets, convolu ‐ tional nets, recurrent nets, long short - term memory ( lstm ) nets, autoencoders and generative adversarial networks ( gans ). • techniques for training deep neural nets. • scaling neural networks for large datasets. • learning strategies with reinforcement learning. • handling uncertainty with bayesian deep learning. the first part is based mostly on scikit - learn while the second part uses tensorflow and keras. don ’ t jump into deep waters too hastily : while deep learning is no doubt one of the most exciting areas in machine learning, you should master the fundamentals first. moreover, most problems can be solved quite well using simpler techniques such as random forests and ensemble methods ( discussed in part i )

#### Using Rag to Answer query from Retrieved Documents:

In [40]:
print(rag(original_query, retrieved_text, model, tokenizer))

1. What is deep learning?
Deep learning is a subset of machine learning that involves the use of neural networks with multiple layers to analyze and interpret complex data sets. It is particularly useful for image and speech recognition, natural language processing, and other applications that require the processing of large amounts of data. The term "deep learning" refers to the use of neural networks with multiple layers, which are capable of learning and improving on their own by analyzing large amounts of data.


**Result above looks Pretty Great :)**

## 2. Cross Encoder Re-Ranking:
Using this technique we can score the retrieval of the documents according to the query that we have set. Basically Every doccument will have a score, that will show how much related it is to our query.

**Sounds Interesting?, lets see what is inside:**

So basically main thing use in it is a Cross Encoder that is is present in the Sentence Transformer, When converting text to embeddings its process text together as single input, this allows model to directly compare and contrast the input, understanding their relation in the better way.

In Cross encoder we get score from 0 to 1, document having score close to 1 is most related query. We will give query and each retrieved document one by one to get the score of each document.

In [41]:
from sentence_transformers import CrossEncoder
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

#### Making Query-Doc Pairs for Comparison and Predicting the Score:

In [52]:
query= 'What is Neural Network?'
retrieved_documents= retrieve_doc(query, chroma_collection)
retrieved_text= ""
for doc in retrieved_documents:
  # retrieved_text+=doc
  print(doc)

8in the 1990s, an ann with more than two hidden layers was considered deep. nowadays, it is common to see anns with dozens of layers, or even hundreds, so the definition of “ deep ” is quite fuzzy. 9 “ learning internal representations by error propagation, ” d. rumelhart, g. hinton, r. williams ( 1986 ). multi - layer perceptron and backpropagation an mlp is composed of one ( passthrough ) input layer, one or more layers of tlus, called hidden layers, and one final layer of tlus called the output layer ( see figure 10 - 7 ). the layers close to the input layer are usually called the lower layers, and the ones close to the outputs are usually called the upper layers. every layer except the output layer includes a bias neuron and is fully connected to the next layer. figure 10 - 7. multi - layer perceptron the signal flows only in one direction ( from the inputs to the out ‐ puts ), so this architecture is an example of a feedforward neural net ‐ work ( fnn ). when an ann contains a dee

In [53]:
pairs = [[query, doc] for doc in retrieved_documents]
scores = cross_encoder.predict(pairs)
print("Scores:")
for score in scores:
    print(score)

Scores:
-9.233899
3.365209
1.0348009
0.7302442
-6.1474347


**As you can see 4th result, 3rd, and 2nd results are matching more because they have high score**

Lets Check the Highest and Lowest scored results:

In [57]:
retrieved_documents[3]

'14 “ wide & deep learning for recommender systems, ” heng - tze cheng et al. ( 2016 ). as you can see, the sequential api is quite easy to use. however, although sequential models are extremely common, it is sometimes useful to build neural networks with more complex topologies, or with multiple inputs or outputs. for this purpose, keras offers the functional api. building complex models using the functional api one example of a non - sequential neural network is a wide & deep neural network. this neural network architecture was introduced in a 2016 paper by heng - tze cheng et al. 14. it connects all or part of the inputs directly to the output layer, as shown in figure 10 - 13. this architecture makes it possible for the neural network to learn both deep patterns ( using the deep path ) and simple rules ( through the short path ). in contrast, a regular mlp forces all the data to flow through the full stack of layers, thus simple patterns in the data may end up being distorted by th

In [59]:
retrieved_documents[0]

'8in the 1990s, an ann with more than two hidden layers was considered deep. nowadays, it is common to see anns with dozens of layers, or even hundreds, so the definition of “ deep ” is quite fuzzy. 9 “ learning internal representations by error propagation, ” d. rumelhart, g. hinton, r. williams ( 1986 ). multi - layer perceptron and backpropagation an mlp is composed of one ( passthrough ) input layer, one or more layers of tlus, called hidden layers, and one final layer of tlus called the output layer ( see figure 10 - 7 ). the layers close to the input layer are usually called the lower layers, and the ones close to the outputs are usually called the upper layers. every layer except the output layer includes a bias neuron and is fully connected to the next layer. figure 10 - 7. multi - layer perceptron the signal flows only in one direction ( from the inputs to the out ‐ puts ), so this architecture is an example of a feedforward neural net ‐ work ( fnn ). when an ann contains a de