In [18]:
from langchain_community.document_loaders.pdf import PyPDFLoader
loader = PyPDFLoader(
    file_path = "Pdfs/Generative_AI.pdf",
)  
document = loader.load()

In [19]:
from langchain_text_splitters.character import RecursiveCharacterTextSplitter

In [20]:
splitter = RecursiveCharacterTextSplitter(chunk_overlap=100, chunk_size=100)
chunks = splitter.split_documents(documents = document)
len(chunks)

12135

In [21]:
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

In [22]:
chunks[0]

Document(metadata={'source': 'Pdfs/Generative_AI.pdf', 'page': 2}, page_content='“Cutting through the clutter, Martin Musiol explains generative AI with')

In [23]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [24]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import FAISS
from uuid import uuid4

index = faiss.IndexFlatL2(1536)

vector_store = FAISS(
    embedding_function=OpenAIEmbeddings(model = "text-embedding-3-small"),
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

ids = [str(uuid4()) for _ in range(len(chunks))]

In [25]:
chunks[0]

Document(metadata={'source': 'Pdfs/Generative_AI.pdf', 'page': 2}, page_content='“Cutting through the clutter, Martin Musiol explains generative AI with')

In [26]:
ids[0]

'5d9d9373-84a3-4000-8660-424938e70b48'

In [27]:
texts = [doc.page_content for doc in chunks]
metadatas = [doc.metadata for doc in chunks]

In [28]:
texts[0]

'“Cutting through the clutter, Martin Musiol explains generative AI with'

In [29]:
vector_store.add_texts(texts=texts)

['2f89954b-c0df-4544-b187-fe3353a4d62c',
 '5a1db31c-1660-4333-a7af-15ba14bf34bf',
 'e6f8b8c8-f3b1-45a7-9ccb-eacf0d16b8b9',
 '5d177bad-56a0-41f8-8807-3349b5c8643f',
 'b6bc0704-eabf-44ee-a4da-809600f452fc',
 '9ff9a05d-de18-4ee4-9340-ca76dc423a9d',
 '0055ec37-550a-403c-83b1-efdccc6aadb5',
 'd5a6a6b5-3eb6-4ba7-aa40-b701e8cd2b6d',
 'bd9d4b0e-38db-4299-9ec8-fd4260e0e0c0',
 'f62a0b1c-6cff-4799-8c20-a55d41923239',
 '75e54aa3-953f-4b84-bb23-afca5a7d235e',
 'd0744371-f012-479c-ade5-d78e7994f678',
 'ba0a3c81-e9d3-4b14-900e-bfd7e2ffee1e',
 '4d24e5ac-3994-4894-92f0-e6942c9e5a39',
 'a2e1b64c-0da8-49b3-85d3-f8d5abcebab1',
 '089a9d15-1ac8-48c0-abd8-ccdf72ac5a93',
 'bcbd0b54-f07a-4ce4-8472-df4462b31c01',
 '46ac4cfc-0c2c-4110-ba7c-c95cbb9b173f',
 'add4cf94-2f20-4f81-9d9c-b983590e2cc1',
 'f8fdf392-f1bf-4615-b3ae-a81d56665435',
 '1578c312-eaeb-4a86-b1a9-488d2b472053',
 '925204ac-1ef2-4b4d-a816-c038c159b011',
 '307966c5-f9a4-4e3a-90bb-0fefe5846983',
 '35108fbd-311b-4f9b-af3f-b21ca46f2d4c',
 'bf87a744-174f-

In [30]:
vector_store.add_documents(documents=chunks, ids=ids)

['5d9d9373-84a3-4000-8660-424938e70b48',
 '91c6fbe9-e7a9-4fe4-ac12-7cd9ce851557',
 'bd464362-562c-41ca-86ec-69d6305c5135',
 '29b41e94-698b-48b8-82ff-1b0346404e41',
 '30fc993d-9ea3-4336-b445-2df4560aeeba',
 '2290ef48-3f2e-41e4-a4ac-92c70796ee61',
 '58463eb4-764e-44cf-aa32-56e1d93c7957',
 'bb05afcb-b8ad-4b63-84d2-9bfa8df08bc0',
 'cd49c544-e018-4c7b-bbec-0fa04b59d419',
 '536bc669-0e37-4e96-a200-37ebcc206bce',
 'aa771883-6873-4894-9759-56369aef0be0',
 'fba8dd87-c6da-418c-87bb-8b3e9389f4cb',
 '867af92a-3ef5-414d-b4af-dfa146563c87',
 '2323929f-90fd-4dd8-8273-528e895a04e2',
 'cc125502-aca2-4dd8-8b85-02e21fe6d4ba',
 '7d045368-d0eb-4f0a-9290-33937d0dd900',
 '8eeb0aa9-81c4-4fbf-afa7-bab37a145a6f',
 '01dc1182-197e-4039-8d55-e0efa1d925a6',
 'a04dac47-c3d8-48c3-9f0e-a638ecf15294',
 'a1c3d526-8d03-41a0-9604-165e909af60e',
 '8c36a61e-2d5d-4464-9fce-fb28f73ceff1',
 '160302aa-337a-412d-a668-78d3f26e263f',
 'b0b0dac4-4be7-4ed9-aa9b-afde3a7ca416',
 'c87f5253-d2a5-47d4-96f3-4d3ff292b46c',
 '2bd5c01d-ec3c-

In [31]:
results = vector_store.similarity_search(
    "What is GenAI?",
    k=2
)
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* practical applications. The emergence of GenAI demonstrates [{'source': 'Pdfs/Generative_AI.pdf', 'page': 285}]
* practical applications. The emergence of GenAI demonstrates [{}]


In [32]:
retriever = vector_store.as_retriever(
                search_type="mmr",
                search_kwargs={"k": 1, "fetch_k": 2, "lambda_mult": 0.5},
            )
result_rtr = retriever.invoke("What is GenAI?")
for res in results:
    print(f"* {res.page_content} [{res.metadata}]")

* practical applications. The emergence of GenAI demonstrates [{'source': 'Pdfs/Generative_AI.pdf', 'page': 285}]
* practical applications. The emergence of GenAI demonstrates [{}]


In [33]:
print(results)

[Document(metadata={'source': 'Pdfs/Generative_AI.pdf', 'page': 285}, page_content='practical applications. The emergence of GenAI demonstrates'), Document(metadata={}, page_content='practical applications. The emergence of GenAI demonstrates')]


In [34]:
from langchain_openai import ChatOpenAI
from langchain_core.messages import SystemMessage, HumanMessage

openai_model = ChatOpenAI(model = "gpt-3.5-turbo", temperature = 0)


messages = [
    SystemMessage(content=f"Answers the Following Question: based on the context given in the list {results[0]}, if the prompt doesn't have context don't use your own knowledge"),
    HumanMessage(content="What is GenAI?"),
]

print(messages)
result = openai_model.invoke(messages)
print(result)
print(f"Answer: {result.content}")

[SystemMessage(content="Answers the Following Question: based on the context given in the list page_content='practical applications. The emergence of GenAI demonstrates' metadata={'source': 'Pdfs/Generative_AI.pdf', 'page': 285}, if the prompt doesn't have context don't use your own knowledge", additional_kwargs={}, response_metadata={}), HumanMessage(content='What is GenAI?', additional_kwargs={}, response_metadata={})]
content='GenAI refers to Generative Artificial Intelligence, which is a technology that can create new content, such as images, text, or music, based on patterns and data it has been trained on. It demonstrates practical applications in various fields, showcasing its ability to generate new and creative outputs.' additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 57, 'prompt_tokens': 75, 'total_tokens': 132, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_predict