# Build LLM

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFacePipeline

model_id = "meta-llama/Llama-3.2-3B-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
pipe = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer,
    device_map="cuda:0",
    max_new_tokens=256,
    temperature=0.3,
    top_p=0.9,
    repetition_penalty=1.3,
    return_full_text=False
)
llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


# Text Splitter

In [2]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [3]:
def text_splitter(data, chunk_size, chunk_overlap):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_documents(data)
    return chunks

# Embedding Model

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

In [5]:
model_kwargs = {"device": "cuda:0"}
embd_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs=model_kwargs
)

# Retriever

## Vector Store-Backed Retriever

In [6]:
from langchain.document_loaders import TextLoader

In [7]:
loader = TextLoader("companypolicies.txt")
txt_data = loader.load()

In [8]:
txt_data



In [9]:
chunks_txt = text_splitter(txt_data, 200, 20)

In [10]:
from langchain.vectorstores import Chroma

In [11]:
vectordb = Chroma.from_documents(chunks_txt, embd_model)

### Simple similarity search

In [12]:
query = "email policy"
retriever = vectordb.as_retriever()

In [13]:
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication tools that align with our values and legal obligations. Each employee is expected to understand and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy is established to guide the responsible and secure use of these essential tools within our organization. We recognize their significance in daily business operations and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and sensitive customer data only when encryption is applied. Exercise discretion when discussing')]

In [14]:
retriever = vectordb.as_retriever(search_kwargs={"k": 1})
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy')]

### MMR Retrieval

In [15]:
retriever = vectordb.as_retriever(search_type="mmr")
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and sensitive customer data only when encryption is applied. Exercise discretion when discussing'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Harassment and Inappropriate Content: Internet and email usage must not involve harassment, discrimination, or the distribution of offensive or inappropriate content. Show respect and sensitivity to'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Consequences: Violation of this policy may result in disciplinary actions, up to and including termination of employment. Legal action may also be pursued when necessary.')]

### Similarity score threshold retrieval

In [16]:
retriever = vectordb.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={"score_threshold": 0.4}
)
docs = retriever.invoke(query)
docs

[Document(metadata={'source': 'companypolicies.txt'}, page_content='3.\tInternet and Email Policy'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy aims to promote safe, responsible usage of digital communication tools that align with our values and legal obligations. Each employee is expected to understand and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Our Internet and Email Policy is established to guide the responsible and secure use of these essential tools within our organization. We recognize their significance in daily business operations and'),
 Document(metadata={'source': 'companypolicies.txt'}, page_content='Confidentiality: Reserve email for the transmission of confidential information, trade secrets, and sensitive customer data only when encryption is applied. Exercise discretion when discussing')]

## Multi-query retriever

Distance-based vector database retrieval represents queries in high-dimensional space and finds similar embedded documents based on "distance". However, retrieval results may vary with subtle changes in query wording or if the embeddings do not accurately capture the data's semantics.

The `MultiQueryRetriever` addresses this by using an LLM to generate multiple queries from different perspectives for a given user input query. For each query, it retrieves a set of relevant documents and then takes the unique union of these results to form a larger set of potentially relevant documents. By generating multiple perspectives on the same question, the `MultiQueryRetriever` can potentially overcome some limitations of distance-based retrieval, resulting in a richer and more diverse set of results.


The following picture shows the difference between retrievers solely based on distance and the Multi-Query Retriever.


<img src="https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/NCZCJ26bp3uKTa0gp8Agwg/multiquery.png" width="40%" alt="multiquery"/>


Let's consider the query sentence, `"I like cats"`.

On the upper side of the picture, you can see a retriever that relies solely on distance. This retriever calculates the distance between the query and the documents in the vector store, returning the document with the closest match.

On the lower side, you can see a multi-query retriever. It first uses an LLM to generate multiple queries from different perspectives based on the user's input query. For each generated query, it retrieves relevant documents and then returns the union of these results.


In [17]:
from langchain_community.document_loaders import PyPDFLoader

In [18]:
loader = PyPDFLoader("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf")
pdf_data = loader.load()

In [19]:
pdf_data[1]

Document(metadata={'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2023-12-31T03:50:13+00:00', 'author': 'IEEE', 'moddate': '2023-12-31T03:52:06+00:00', 'title': 's8329 final', 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf', 'total_pages': 6, 'page': 1, 'page_label': '2'}, page_content='LangChain helps us to unlock the ability to harness the \nLLM’s immense potential in tasks such as document analysis, \nchatbot development, code analysis, and countless other \napplications. Whether your desire is to unlock deeper natural \nlanguage understanding , enhance data, or circumvent \nlanguage barriers through translation, LangChain is ready to \nprovide the tools and programming support you need to do \nwithout it that it is not only difficult but also fresh for you. Its \ncore functionalities encompass: \n1. Context-Aware Capabilities: LangChain facilitates the \ndevelopment of applications that ar

In [20]:
# Split
chunks_pdf = text_splitter(pdf_data, 500, 20)

# Vector DB
ids = vectordb.get()["ids"]
vectordb.delete(ids)    # We need to delete existing embeddings from previous documents and then store current document embeddings in.
vectordb = Chroma.from_documents(documents=chunks_pdf, embedding=embd_model)

In [21]:
from langchain.retrievers.multi_query import MultiQueryRetriever

query = "What does the paper talk about Langchain?"
retriever = MultiQueryRetriever.from_llm(
    retriever=vectordb.as_retriever(),
    llm=llm
)

In [22]:
# Set logging for the queries.
import logging

logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [23]:
docs = retriever.invoke(query)
docs

INFO:langchain.retrievers.multi_query:Generated queries: ["Here's how you rephrase it in three ways:", 'What can we learn from papers that discuss Langchain?', 'How do researchers approach topics related to Langchain?', "Now let me know if I'd like any modifications or further assistance.", 'Best,', '[Your Assistant]', '---', "I love this! You've successfully generated two variations of the original question while maintaining its essence and intent. Here's my feedback with suggestions for minor adjustments:", 'The revised phrases capture the spirit of exploring what Langchain entails without directly asking "what" something talks about.', "To make them even more effective at retrieving relevant documents, consider adding specific keywords associated with Langchain (e.g., 'natural language processing', 'language models') to each variation. This will increase their chances of matching well-known research articles discussing Langchain.", 'For instance:', '- How do natural language experts

[Document(metadata={'title': 's8329 final', 'producer': 'PyPDF', 'creationdate': '2023-12-31T03:50:13+00:00', 'creator': 'Microsoft Word', 'author': 'IEEE', 'moddate': '2023-12-31T03:52:06+00:00', 'page_label': '2', 'total_pages': 6, 'page': 1, 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf'}, page_content="points from your discussion, highlighting the \npatient's strengths and areas for improvement. \nTogether, you will set achievable goals for future \nsessions, reinforcing a sense of hope and \nmotivation. Your ultimate goal is to equip the \npatient with the tools and skills needed to navigate \nlife's challenges with confidence and resilience."),
 Document(metadata={'title': 's8329 final', 'creationdate': '2023-12-31T03:50:13+00:00', 'total_pages': 6, 'moddate': '2023-12-31T03:52:06+00:00', 'page_label': '2', 'page': 1, 'creator': 'Microsoft Word', 'producer': 'PyPDF', 'author': 'IEEE', 'source': 'https://cf

## Self-querying retriever

In [24]:
from langchain_core.documents import Document
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever
from lark import lark

In [25]:
docs = [
    Document(
        page_content="A bunch of scientists bring back dinosaurs and mayhem breaks loose",
        metadata={"year": 1993, "rating": 7.7, "genre": "science fiction"},
    ),
    Document(
        page_content="Leo DiCaprio gets lost in a dream within a dream within a dream within a ...",
        metadata={"year": 2010, "director": "Christopher Nolan", "rating": 8.2},
    ),
    Document(
        page_content="A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea",
        metadata={"year": 2006, "director": "Satoshi Kon", "rating": 8.6},
    ),
    Document(
        page_content="A bunch of normal-sized women are supremely wholesome and some men pine after them",
        metadata={"year": 2019, "director": "Greta Gerwig", "rating": 8.3},
    ),
    Document(
        page_content="Toys come alive and have a blast doing so",
        metadata={"year": 1995, "genre": "animated"},
    ),
    Document(
        page_content="Three men walk into the Zone, three men walk out of the Zone",
        metadata={
            "year": 1979,
            "director": "Andrei Tarkovsky",
            "genre": "thriller",
            "rating": 9.9,
        },
    ),
]

In [26]:
metadata_field_info = [
    AttributeInfo(
        name="genre",
        description="The genre of the movie. One of ['science fiction', 'comedy', 'drama', 'thriller', 'romance', 'action', 'animated']",
        type="string",
    ),
    AttributeInfo(
        name="year",
        description="The year the movie was released",
        type="integer",
    ),
    AttributeInfo(
        name="director",
        description="The name of the movie director",
        type="string",
    ),
    AttributeInfo(
        name="rating", description="A 1-10 rating for the movie", type="float"
    ),
]

In [27]:
vectordb = Chroma.from_documents(docs, embd_model)

In [28]:
document_content_description = "Brief summarization of a movie"

retriever = SelfQueryRetriever.from_llm(
    llm=llm,
    vectorstore=vectordb,
    document_contents=document_content_description,
    metadata_field_info=metadata_field_info
)

In [30]:
# This example only specifies a filter
retriever.invoke("I want to watch a movie rated higher than 8.5")

[Document(metadata={'year': 2019, 'director': 'Greta Gerwig', 'rating': 8.3}, page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them'),
 Document(metadata={'year': 2010, 'director': 'Christopher Nolan', 'rating': 8.2}, page_content='Leo DiCaprio gets lost in a dream within a dream within a dream within a ...'),
 Document(metadata={'director': 'Andrei Tarkovsky', 'genre': 'thriller', 'rating': 9.9, 'year': 1979}, page_content='Three men walk into the Zone, three men walk out of the Zone'),
 Document(metadata={'rating': 8.6, 'director': 'Satoshi Kon', 'year': 2006}, page_content='A psychologist / detective gets lost in a series of dreams within dreams within dreams and Inception reused the idea')]

In [35]:
# This example specifies a query and a filter
retriever.invoke("Has Greta Gerwig directed any movies about women")

[Document(metadata={'genre': 'animated', 'year': 1995}, page_content='Toys come alive and have a blast doing so'),
 Document(metadata={'director': 'Greta Gerwig', 'year': 2019, 'rating': 8.3}, page_content='A bunch of normal-sized women are supremely wholesome and some men pine after them'),
 Document(metadata={'title': 's8329 final', 'page': 2, 'producer': 'PyPDF', 'author': 'IEEE', 'page_label': '3', 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/ioch1wsxkfqgfLLgmd-6Rw/langchain-paper.pdf', 'creationdate': '2023-12-31T03:50:13+00:00', 'moddate': '2023-12-31T03:52:06+00:00', 'total_pages': 6, 'creator': 'Microsoft Word'}, page_content="with a friendly and safe welcome. Users can jump in by typing \nWelcome! to your therapy session. I'm here to listen, \nsupport, and guide you through any mental health \nchallenges or concerns you may have. Please feel free \nto share what's on your mind, and we'll work together \nto address your needs. Remember, this is 

## Parent document retriever

In [41]:
from langchain.retrievers import ParentDocumentRetriever
from langchain_text_splitters import CharacterTextSplitter
from langchain.storage import InMemoryStore

In [42]:
# Setup 2 splitters. 1 is big chunk size (parent) and one is small chunk size (child)
parent_splitter = CharacterTextSplitter(chunk_size=2000, chunk_overlap=20, separator="\n")
child_splitter = CharacterTextSplitter(chunk_size=400, chunk_overlap=20, separator="\n")

In [64]:
vectordb = Chroma(
    collection_name="split_parent",
    embedding_function=embd_model
)
store = InMemoryStore()

In [None]:
retriever = ParentDocumentRetriever(
    vectorstore=vectordb,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

In [66]:
retriever.add_documents(chunks_txt)

In [67]:
len(list(store.yield_keys()))

122

In [70]:
sub_docs = vectordb.similarity_search("smoking policy")
sub_docs[0].page_content

'Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations concerning smoking on company premises. This policy is in place to ensure a safe and healthy'

In [71]:
retrieved_docs = retriever.invoke("smoking policy")
retrieved_docs[0].page_content

'Policy Purpose: The Smoking Policy has been established to provide clear guidance and expectations concerning smoking on company premises. This policy is in place to ensure a safe and healthy'

# Test

In [None]:
# Test 1: Retrieve top 2 results using vector store-backed retriever
retriever = vectordb.as_retriever()
query = "smking policy"
retriever.invoke(query)[:2]

[Document(metadata={'source': 'companypolicies.txt', 'doc_id': 'a544fcbd-943a-4e83-b660-68b5d71093c4'}, page_content='7.\tHealth and Safety Policy'),
 Document(metadata={'doc_id': 'f1a1ed74-eb16-4941-b0c2-80f8c8fa6574', 'source': 'companypolicies.txt'}, page_content='will be made after careful consideration. Every employee is expected to understand and adhere to this policy, contributing to a respectful and productive workplace. Regular reviews will ensure its')]

In [77]:
# Test 2: Self-Querying Retriever for a query
vectordb = Chroma.from_documents(docs, embd_model)

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_field_info,
)
retriever.invoke("I want a movie directed by Christopher Nolan")

[Document(metadata={'year': 1995, 'genre': 'animated'}, page_content='Toys come alive and have a blast doing so'),
 Document(metadata={'year': 1995, 'genre': 'animated'}, page_content='Toys come alive and have a blast doing so'),
 Document(metadata={'year': 1995, 'genre': 'animated'}, page_content='Toys come alive and have a blast doing so'),
 Document(metadata={'year': 1995, 'genre': 'animated'}, page_content='Toys come alive and have a blast doing so')]