In [None]:
!pip install langchain_core langchain_openai langchain_chroma rank_bm25

In [None]:
!pip install langchain_community

In [None]:
!pip install pypdf

In [14]:
import os
from google.colab import userdata
os.environ["OPENAI_API_KEY"] =userdata.get('OPENAI_API_KEY')

In [7]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader('/content/build-career-in-ai.pdf')
documents = loader.load()

In [9]:
len(documents)

41

### Imports

In [23]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

In [26]:
embeddings = OpenAIEmbeddings()
llm = ChatOpenAI(model="gpt-4o-mini",temperature=0)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=500)

In [27]:
chunks = text_splitter.split_documents(documents)

In [28]:
len(chunks)

432

# Prompt for adding context to the chunks

In [29]:
contextual_retrieval_prompt = ChatPromptTemplate.from_template("""
<document>
{DOCUMENT}
</document>
Here is the chunk we want to situate within the whole document
<chunk>
{CHUNK}
</chunk>
Please give a short succinct context to situate this chunk within the overall document for the purposes of improving search retrieval of the chunk. Answer only with the succinct context and nothing else.
"""
)

In [30]:
def adding_context(prompt, document, chunk):
  content = prompt.format(DOCUMENT=document, CHUNK=chunk)
  response = llm.invoke(content)
  return response.content

In [31]:
contextualized_chunks = []

for chunk in chunks:
  context = adding_context(contextual_retrieval_prompt, documents, chunk)
  contextualized_content = f"{context}\n\n{chunk.page_content}"
  contextualized_chunks.append(Document(page_content=contextualized_content, metadata=chunk.metadata))
  print(f"Added context to chunk no:{len(contextualized_chunks)}")



Added context to chunk no:1
Added context to chunk no:2
Added context to chunk no:3
Added context to chunk no:4
Added context to chunk no:5
Added context to chunk no:6
Added context to chunk no:7
Added context to chunk no:8
Added context to chunk no:9
Added context to chunk no:10
Added context to chunk no:11
Added context to chunk no:12
Added context to chunk no:13
Added context to chunk no:14
Added context to chunk no:15
Added context to chunk no:16
Added context to chunk no:17
Added context to chunk no:18
Added context to chunk no:19
Added context to chunk no:20
Added context to chunk no:21
Added context to chunk no:22
Added context to chunk no:23
Added context to chunk no:24
Added context to chunk no:25
Added context to chunk no:26
Added context to chunk no:27
Added context to chunk no:28
Added context to chunk no:29
Added context to chunk no:30
Added context to chunk no:31
Added context to chunk no:32
Added context to chunk no:33
Added context to chunk no:34
Added context to chunk 

In [36]:
contextualized_chunks[102]

Document(metadata={'source': '/content/build-career-in-ai.pdf', 'page': 10}, page_content='This chunk is part of Chapter 2, "Learning Technical Skills for a Promising AI Career," where the author discusses the importance of building small, consistent habits for effective learning. It emphasizes starting with manageable goals, such as watching a brief educational video daily, to cultivate a routine of continuous study and skill development in the field of AI.\n\nand succeed, rather than start  too big and fail. For example, rather than trying to \nexercise for 30 minutes a day, he recommends aspiring to do just one push-up, and \ndoing it consistently.\nThis approach may be helpful to those of you who want to spend more time studying. \nIf you start by holding yourself accountable for watching, say, 10 seconds of an \neducational video every day — and you do so consistently — the habit of studying daily')

### Vectorizing and saving in vectorstore

In [37]:
from langchain_chroma import Chroma

db = Chroma.from_documents(contextualized_chunks ,embeddings)

In [38]:
from langchain.retrievers import EnsembleRetriever
from langchain_community.retrievers import BM25Retriever

In [39]:
vector_retriever = db.as_retriever()
bm25_retriever = BM25Retriever.from_documents(contextualized_chunks)

In [40]:
ensemble_retriever = EnsembleRetriever(
    retrievers=[bm25_retriever, vector_retriever], weights=[0.5, 0.5]
)

In [49]:
ensemble_retriever.invoke("How to excel in AI?")

[Document(metadata={'source': '/content/build-career-in-ai.pdf', 'page': 12}, page_content='This chunk is from Chapter 3 of the document, titled "Should You Learn Math to Get a Job in AI?" It discusses the importance of math as a foundational skill for a career in AI, emphasizing the need to prioritize specific mathematical knowledge relevant to decision-making in AI roles.\n\nPAGE 13Should you Learn Math to Get a Job in AI? CHAPTER 3\nIs math a foundational skill for AI? It’s always nice to know more math! But there’s so much to \nlearn that, realistically, it’s necessary to prioritize. Here’s how you might go about strengthening \nyour math background.\nTo figure out what’s important to know, I find it useful to ask what you need to know to make \nthe decisions required for the work you want to do. At DeepLearning.AI, we frequently ask,'),
 Document(metadata={'page': 34, 'source': '/content/build-career-in-ai.pdf'}, page_content='This chunk is from Chapter 10, titled "Keys to Buildin