In [1]:
import os
import streamlit as st
from langchain_google_genai import GoogleGenerativeAIEmbeddings,GoogleGenerativeAI
os.environ["api_key"] = st.secrets["secrets"]["api_key"]

embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001",google_api_key=os.environ["api_key"])
model = GoogleGenerativeAI(temperature=0.0,
            model="gemini-pro",
            google_api_key=os.environ["api_key"],
            
        )

  from .autonotebook import tqdm as notebook_tqdm


In [2]:

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

from langchain.schema import Document

# Load blog post
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("1st_Edition_Catalogue_22_23_v01.pdf")
data = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=200)
docs = text_splitter.split_documents(data)


In [3]:
vector1 = embedding.embed_query("who is the Chief Operating Officer of human resources at the university?")
vector2 = embedding.embed_query("What is the longest bachelor's degree in terms of number of credit hours?")
vector3 = embedding.embed_query("where is the location of the uni")

data_vectors = [embedding.embed_query(doc.page_content) for doc in docs]
print(len(data_vectors))

1821


In [4]:
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt
import numpy as np

cosine_sims_1 = [cosine_similarity([vector1], [data_vector])[0][0] for data_vector in data_vectors]
cosine_sims_2 = [cosine_similarity([vector2], [data_vector])[0][0] for data_vector in data_vectors]
cosine_sims_3 = [cosine_similarity([vector3], [data_vector])[0][0] for data_vector in data_vectors]

x = np.arange(len(data_vectors))

plt.scatter(x, cosine_sims_1, label='officer', alpha=0.7)
plt.scatter(x, cosine_sims_2, label='degree', alpha=0.7)
plt.scatter(x, cosine_sims_3, label='location', alpha=0.7)

plt.ylabel('Cosine Similarity')
plt.title('Consine Similarity between query and data vectors')
plt.legend()

plt.show()

  plt.show()


In [11]:

from langchain.storage import InMemoryStore
from langchain.retrievers import ParentDocumentRetriever

child_splitter = RecursiveCharacterTextSplitter(chunk_size=400, chunk_overlap=20)
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=750, chunk_overlap=20)
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=embedding
)
store = InMemoryStore()
retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter
)

In [12]:
retriever.add_documents(docs, ids=None)

In [28]:
vectorstore.similarity_search("where is the location of the uni", k=5)


[Document(page_content='University Campus & Map Location  ................................ ................................ ................................ .... 391', metadata={'doc_id': '6277294b-42e4-4249-854c-f236bedc996c', 'page': 7, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='Management  Faculty of Management', metadata={'doc_id': '750cf7fc-5157-40a0-b520-e3c55aeaf9ec', 'page': 89, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='Michigan, USA . It offers a range of undergraduate and graduate programs', metadata={'doc_id': '398a1546-4078-4a51-be24-7e0a20d44748', 'page': 231, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='walking distance from the University campus, ensuring an easy commute from anywhere on  the network. \nThere are also a wide variety of shops and restaurants nearby . \nAcademic Facilities  \nThe University’s classrooms, laboratories and studios are fully furnished and well -equi

In [29]:
retriever.get_relevant_documents("where is the location of the uni")


[Document(page_content='University Campus & Map Location  ................................ ................................ ................................ .... 391', metadata={'doc_id': '6277294b-42e4-4249-854c-f236bedc996c', 'page': 7, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='Management  Faculty of Management', metadata={'doc_id': '750cf7fc-5157-40a0-b520-e3c55aeaf9ec', 'page': 89, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='Michigan, USA . It offers a range of undergraduate and graduate programs', metadata={'doc_id': '398a1546-4078-4a51-be24-7e0a20d44748', 'page': 231, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='walking distance from the University campus, ensuring an easy commute from anywhere on  the network. \nThere are also a wide variety of shops and restaurants nearby . \nAcademic Facilities  \nThe University’s classrooms, laboratories and studios are fully furnished and well -equi

In [15]:
retriever.invoke("where is the location of the uni")

[Document(page_content='HRM  210 Human Resource Management  MGT -202 3', metadata={'source': '1st_Edition_Catalogue_22_23_v01.pdf', 'page': 130}),
 Document(page_content='HRM  210 Human Resource Management  MGT -202 3', metadata={'source': '1st_Edition_Catalogue_22_23_v01.pdf', 'page': 133}),
 Document(page_content='Canadian University Dubai Catalogue 22-23 \nV. 1.0|  1st Edition   Page  66 of 391 \n \n \nArticle 7 – Requirements for remaining in Student council  \n• Student Council members are required to attend all Student council  scheduled meetings; \nmembers should inform the President of the Student Council for a non -attendance;  \n• Members of the Student council  are bound to their duties, and responsibilities;  \n• Members of the Student council  must maintain the required CGPA.  \nArticle 8 – Resignation from Office  \nMembers of the Student council  wanting to resign from office must submit a written letter of resignation to \nthe University Senate and the Dean of Student A

### MultiQueryRetriever

Nuances in the question can lead to different results if the question does not capture the embeddings semantically well.
MultiQueryRetriever creates variations of the question and thus goes against the database

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.retrievers.multi_query import MultiQueryRetriever

llm = ChatOpenAI(
        temperature=0,
        max_tokens=800,
        model_kwargs={"top_p": 0, "frequency_penalty": 0, "presence_penalty": 0},
    )


retriever = MultiQueryRetriever.from_llm(
    retriever=vectorstore.as_retriever(), llm=llm
)

In [None]:
unique_docs = retriever.get_relevant_documents("What is the name of the dog school?")
len(unique_docs)

In [None]:
from typing import List

from langchain.chains import LLMChain
from langchain.output_parsers import PydanticOutputParser
from langchain.prompts import PromptTemplate
from pydantic import BaseModel, Field


class LineList(BaseModel):
    lines: List[str] = Field(description="Lines of text")


class LineListOutputParser(PydanticOutputParser):
    def __init__(self) -> None:
        super().__init__(pydantic_object=LineList)

    def parse(self, text: str) -> LineList:
        lines = text.strip().split("\n")
        return LineList(lines=lines)


output_parser = LineListOutputParser()

QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from a vector
    database. By generating multiple perspectives on the user question, your goal is to help
    the user overcome some of the limitations of the distance-based similarity search.
    Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

llm_chain = LLMChain(llm=llm, prompt=QUERY_PROMPT, output_parser=output_parser)

In [None]:
llm_chain.invoke("What is the name of the dog school?")

In [None]:
question = "What is the name of the dog school?"


### Contextual Compression

To use the Contextual Compression Retriever, you need:

    a basic retriever
    a document compressor

The Contextual Compression Retriever passes queries to the Base Retriever, takes the source documents and forwards them to the Document Compressor. The document compressor takes a list of documents and shortens them by reducing the content of documents or omitting documents altogether.

In [16]:
vectorstore = Chroma(
    collection_name="full_documents", embedding_function=embedding
)
vectorstore.add_documents(docs)
retriever = vectorstore.as_retriever()

In [18]:
retriever.get_relevant_documents(query="where is the location of the uni")

[Document(page_content='University Campus & Map Location  ................................ ................................ ................................ .... 391', metadata={'doc_id': '6277294b-42e4-4249-854c-f236bedc996c', 'page': 7, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='Management  Faculty of Management', metadata={'doc_id': '750cf7fc-5157-40a0-b520-e3c55aeaf9ec', 'page': 89, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='Michigan, USA . It offers a range of undergraduate and graduate programs', metadata={'doc_id': '398a1546-4078-4a51-be24-7e0a20d44748', 'page': 231, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='walking distance from the University campus, ensuring an easy commute from anywhere on  the network. \nThere are also a wide variety of shops and restaurants nearby . \nAcademic Facilities  \nThe University’s classrooms, laboratories and studios are fully furnished and well -equi

In [21]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

compressor = LLMChainExtractor.from_llm(model)
compression_retriever = ContextualCompressionRetriever(base_compressor=compressor, base_retriever=retriever)
def pretty_print_docs(docs):
    print(f"\n{'-' * 100}\n".join([f"Document {i+1}:\n" + d.page_content for i, d in enumerate(docs)]))
compressed_docs = compression_retriever.get_relevant_documents(query="where is the location of the uni")
pretty_print_docs(compressed_docs)



Document 1:
University Campus & Map Location


In [23]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers.document_compressors import EmbeddingsFilter

embeddings_filter = EmbeddingsFilter(embeddings=embedding, similarity_threshold=0.5)
compression_retriever = ContextualCompressionRetriever(base_compressor=embeddings_filter, base_retriever=retriever)

compressed_docs = compression_retriever.get_relevant_documents(query="where is the location of the uni")
pretty_print_docs(compressed_docs)

Document 1:
University Campus & Map Location  ................................ ................................ ................................ .... 391
----------------------------------------------------------------------------------------------------
Document 2:
Management  Faculty of Management
----------------------------------------------------------------------------------------------------
Document 3:
Michigan, USA . It offers a range of undergraduate and graduate programs
----------------------------------------------------------------------------------------------------
Document 4:
walking distance from the University campus, ensuring an easy commute from anywhere on  the network. 
There are also a wide variety of shops and restaurants nearby . 
Academic Facilities  
The University’s classrooms, laboratories and studios are fully furnished and well -equipped with the latest


In [25]:
from langchain.document_transformers import EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain.text_splitter import CharacterTextSplitter

splitter = CharacterTextSplitter(chunk_size=300, chunk_overlap=0, separator=". ")
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding)
relevant_filter = EmbeddingsFilter(embeddings=embedding, similarity_threshold=0.76)
pipeline_compressor = DocumentCompressorPipeline(
    transformers=[splitter, redundant_filter, relevant_filter]
)

compression_retriever = ContextualCompressionRetriever(base_compressor=pipeline_compressor, base_retriever=retriever)

compressed_docs = compression_retriever.get_relevant_documents(query="where is the location of the uni")

pretty_print_docs(compressed_docs)

Document 1:
University Campus & Map Location  ................................ ................................ ................................ .... 391
----------------------------------------------------------------------------------------------------
Document 2:
Management  Faculty of Management
----------------------------------------------------------------------------------------------------
Document 3:
Michigan, USA . It offers a range of undergraduate and graduate programs
----------------------------------------------------------------------------------------------------
Document 4:
walking distance from the University campus, ensuring an easy commute from anywhere on  the network. 
There are also a wide variety of shops and restaurants nearby . 
Academic Facilities  
The University’s classrooms, laboratories and studios are fully furnished and well -equipped with the latest


### Ensemble Retriever

In [27]:
from langchain.retrievers import BM25Retriever, EnsembleRetriever


bm25_retriever = BM25Retriever.from_documents(docs)
bm25_retriever.k = 2

chroma_vectorstore = Chroma.from_documents(docs, embedding)
chroma_retriever = chroma_vectorstore.as_retriever()

ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, chroma_retriever], weights=[0.5, 0.5]
)

In [30]:
docs = ensemble_retriever.get_relevant_documents(query="where is the location of the uni")
docs

[Document(page_content='Second World Wars; the end of the Cold War; the age of Globalizatio n; uni -polar world system; 9/11 and its \nrepercussions; the international financial crisis; Arab world developments. Prerequisite : MCM -410 \n \nMCM 430       Media Internship  \n \nStudents spend six weeks of continuous training in media institutions relevant to their area of specialty where', metadata={'source': '1st_Edition_Catalogue_22_23_v01.pdf', 'page': 334}),
 Document(page_content='s \n \n \n \n \n \nUniversity Catalogue  \n2022/2023', metadata={'page': 0, 'source': '1st_Edition_Catalogue_22_23_v01.pdf'}),
 Document(page_content='and Masters  programs.  \n7. Business -District Location  \nWe have a convenient downtown location in the heart of Dubai’s business district, with state of the art \nacademic and recreational facilities.  \n8. Student -Centered Approach  \nCUD is a student -centered university, where we value student success above all else.  \n9. Research Opportunities  \nOu

### Self-Querying retriever

A self-querying retriever is a retriever that, as the name suggests, has the ability to 
the ability to query itself. More precisely, any natural language query,
 the retriever uses an LLM chain for query construction to write a structured query
 structured query and then applies this structured query to the underlying 
VectorStore. This allows the retriever to not only use the query entered by the user 
query for the semantic similarity comparison with the content of the stored 
documents, but also apply filters from the user query to the metadata of the stored 
metadata of the stored documents and execute these filters.

In [None]:
from langchain.schema import Document
from langchain.vectorstores import Chroma

docs = [
    Document(
        page_content="Bello-Basistraining offers a comprehensive foundation for dog obedience, focusing on basic commands and socialization.",
        metadata={"type": "Basic Training", "feature": "Foundational Skills", "price": "Affordable"},
    ),
    Document(
        page_content="Pfote-Agilitykurs provides a fun and energetic way to keep dogs fit and mentally stimulated through obstacle courses.",
        metadata={"type": "Agility Training", "feature": "Physical Fitness", "price": "Moderate"},
    ),
    Document(
        page_content="Wuff-Verhaltensberatung specializes in addressing behavioral issues, offering tailored strategies for each dog.",
        metadata={"type": "Behavioral Consultation", "feature": "Customized Solutions", "price": "Premium"},
    ),
    Document(
        page_content="Schwanzwedeln-Therapiehundausbildung prepares dogs for roles in therapeutic and support settings, focusing on empathy and gentleness.",
        metadata={"type": "Therapy Dog Training", "feature": "Emotional Support", "price": "High"},
    ),
    Document(
        page_content="Schnüffler-Suchhundetraining trains dogs in scent detection, useful for search and rescue operations.",
        metadata={"type": "Search and Rescue Training", "feature": "Advanced Skills", "price": "Variable"},
    ),
    Document(
        page_content="Hunde-Haftpflichtversicherung offers comprehensive coverage for potential damages or injuries caused by your dog.",
        metadata={"type": "Dog Liability Insurance", "feature": "Financial Protection", "price": "Varies"},
    ),
]

vectorstore = Chroma.from_documents(docs, embedding)


In [None]:
from langchain.vectorstores import Chroma
from langchain.chains.query_constructor.base import AttributeInfo
from langchain.retrievers.self_query.base import SelfQueryRetriever

metadata_field_info = [
    AttributeInfo(
        name="type",
        description="The type of dog training service (e.g., Basic Training, Agility Training, Behavioral Consultation)",
        type="string",
    ),
    AttributeInfo(
        name="feature",
        description="Special features or benefits of the service",
        type="string",
    ),
    AttributeInfo(
        name="price",
        description="Price category of the service (e.g., Affordable, Moderate, Premium)",
        type="string",
    ),
]

document_content_description = "Description of a dog training service"
retriever = SelfQueryRetriever.from_llm(
    llm,
    vectorstore,
    document_content_description,
    metadata_field_info,
)


In [None]:
retriever.invoke("What Premium priced trainings do you offer?")

### Time-weighted vector store retriever

In [None]:
import faiss

from datetime import datetime, timedelta
from langchain.docstore import InMemoryDocstore
from langchain.embeddings import OpenAIEmbeddings
from langchain.retrievers import TimeWeightedVectorStoreRetriever
from langchain.schema import Document
from langchain.vectorstores import FAISS

In [None]:

# decay_rate = .0000000000000000000000001
decay_rate = .999

embedding_size = 1536
index = faiss.IndexFlatL2(embedding_size)
vectorstore = FAISS(embedding, index, InMemoryDocstore({}), {})
retriever = TimeWeightedVectorStoreRetriever(vectorstore=vectorstore, decay_rate=decay_rate, k=1)

In [None]:
yesterday = datetime.now() - timedelta(days=1)
retriever.add_documents([Document(page_content="hello world", metadata={"last_accessed_at": yesterday})])
retriever.add_documents([Document(page_content="hello foo")])

In [None]:
retriever.get_relevant_documents("hello world")