In [1]:
import hashlib
import os
import warnings

In [2]:
## LangSmith tracing: enabled only when LANGSMITH_API_KEY is present in the environment

# The key is expected to be exported before the kernel starts; it is never
# hard-coded here. Writing os.getenv(...) straight back into os.environ raises
# "TypeError: str expected, not NoneType" when the variable is unset, so the
# presence of the key is checked explicitly instead.
if os.environ.get("LANGSMITH_API_KEY"):
    os.environ["LANGSMITH_TRACING"] = "true"
else:
    warnings.warn(
        "LANGSMITH_API_KEY is not set; LangSmith tracing is disabled for this session.",
        stacklevel=2,
    )
    # Tracing is optional for this notebook, so switch it off rather than abort
    os.environ["LANGSMITH_TRACING"] = "false"

In [3]:
## Pinecone configuration: PINECONE_API_KEY must already be exported; the region is set below

# Writing os.getenv(...) straight back into os.environ raises
# "TypeError: str expected, not NoneType" when the variable is unset, so the key
# is only checked here and left untouched when present.
if not os.environ.get("PINECONE_API_KEY"):
    warnings.warn(
        "PINECONE_API_KEY is not set; Pinecone calls later in this notebook "
        "will likely fail until it is exported.",
        stacklevel=2,
    )
os.environ["PINECONE_API_ENV"] = "us-east-1"

In [None]:
## importing required libraries

from langchain.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [5]:
## loading pdf data using PyPDFDirectoryLoader

# The directory can be overridden with the PDF_DOCS_DIR environment variable so
# the notebook is not tied to a single machine; the default is the original path.
pdf_dir = os.environ.get(
    "PDF_DOCS_DIR", "/Users/yash/Library/mcqgen/app/Assistant/pdf_docs"
)

# Fail early with a clear message rather than discovering a bad path downstream
if not os.path.isdir(pdf_dir):
    raise FileNotFoundError(
        f"PDF directory not found: {pdf_dir!r} (set PDF_DOCS_DIR to override)"
    )

loader = PyPDFDirectoryLoader(pdf_dir)
data = loader.load()

# Nothing to chunk or index if no documents were loaded
if not data:
    raise ValueError(f"No PDF documents were loaded from {pdf_dir!r}")

In [6]:
# Preview only the first few loaded documents; echoing the whole list floods the notebook output
data[:3]

[Document(metadata={'producer': 'Skia/PDF m137', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36', 'creationdate': '2025-06-11T03:55:14+00:00', 'title': '2025 ICC Champions Trophy - Wikipedia', 'moddate': '2025-06-11T03:55:14+00:00', 'source': '/Users/yash/Library/mcqgen/app/Assistant/pdf_docs/2025ct.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}, page_content="2025 ICC Champions Trophy\nDates 19 February – 9 March\n2025\nAdministrator(s) International Cricket\nCouncil\nCricket format One Day International\nTournament\nformat(s)\nRound-robin and single-\nelimination\nHost(s) Pakistan\nUnited Arab Emirates[ a ] \nChampions\n \xa0India (3rd title)\nRunners-up\n \xa0New Zealand\nParticipants 8\nMatches 15\nPlayer of the\nseries\n Rachin Ravindra\nMost runs\n  Rachin Ravindra (263)\nMost wickets\n  Matt Henry (10)\nOfﬁcial website icc-cricket.com (https://ww\nw.icc-cricket.com/tourname\nnts/champions-

In [7]:
## build the RecursiveCharacterTextSplitter that will chunk the loaded documents

# chunk size and overlap handed to the splitter
splitter_settings = {"chunk_size": 900, "chunk_overlap": 100}
text_splitter = RecursiveCharacterTextSplitter(**splitter_settings)

In [8]:

## split the loaded documents into chunks with the splitter configured above

text_chunks = text_splitter.split_documents(documents=data)

In [9]:
# Preview only the first few chunks; echoing every chunk floods the notebook output
text_chunks[:3]

[Document(metadata={'producer': 'Skia/PDF m137', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36', 'creationdate': '2025-06-11T03:55:14+00:00', 'title': '2025 ICC Champions Trophy - Wikipedia', 'moddate': '2025-06-11T03:55:14+00:00', 'source': '/Users/yash/Library/mcqgen/app/Assistant/pdf_docs/2025ct.pdf', 'total_pages': 16, 'page': 0, 'page_label': '1'}, page_content='2025 ICC Champions Trophy\nDates 19 February – 9 March\n2025\nAdministrator(s) International Cricket\nCouncil\nCricket format One Day International\nTournament\nformat(s)\nRound-robin and single-\nelimination\nHost(s) Pakistan\nUnited Arab Emirates[ a ] \nChampions\n \xa0India (3rd title)\nRunners-up\n \xa0New Zealand\nParticipants 8\nMatches 15\nPlayer of the\nseries\n Rachin Ravindra\nMost runs\n  Rachin Ravindra (263)\nMost wickets\n  Matt Henry (10)\nOfﬁcial website icc-cricket.com (https://ww\nw.icc-cricket.com/tourname\nnts/champions-

In [10]:

# Number of chunks produced by the splitter
len(text_chunks)

137

In [11]:
## set up the Gemini embedding model

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# The same embeddings object is passed to the Pinecone vector store cells below
embedding_model_name = "models/embedding-001"
gemini_embeddings = GoogleGenerativeAIEmbeddings(model=embedding_model_name)

In [12]:
## smoke-test the embedding model on an arbitrary sentence

sample_sentence = "How are you!"
result = gemini_embeddings.embed_query(sample_sentence)

In [13]:
# Length of the embedding vector returned for the sample sentence
len(result)

768

In [12]:
from langchain_pinecone import PineconeVectorStore

# Vector store handle for the "smart-assistant" index, using the Gemini embeddings
vectorstore = PineconeVectorStore(
    embedding=gemini_embeddings,
    index_name="smart-assistant",
)

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
def _chunk_id(chunk):
    """Return a stable ID for a chunk, derived from its source, page and text."""
    fingerprint = "|".join(
        (
            str(chunk.metadata.get("source", "")),
            str(chunk.metadata.get("page", "")),
            chunk.page_content,
        )
    )
    return hashlib.sha256(fingerprint.encode("utf-8")).hexdigest()


# Without explicit IDs every run of this cell stores the chunks under new IDs,
# so re-running it duplicates the vectors in the index. Deterministic IDs make
# the upload an overwrite instead. Keying by ID also drops exact duplicate
# chunks, so no ID appears twice in one upload.
chunks_by_id = {_chunk_id(chunk): chunk for chunk in text_chunks}

vectorstore_from_docs = PineconeVectorStore.from_documents(
    list(chunks_by_id.values()),
    index_name="smart-assistant",
    embedding=gemini_embeddings,
    ids=list(chunks_by_id),
)

In [14]:
# Sample question run against the vector store's similarity search
query = "Who won ipl in 2025?"
vectorstore.similarity_search(query=query)

[Document(id='1abd7740-0345-4495-9adb-91ae9aa0178f', metadata={'creationdate': '2025-06-30T11:23:18+00:00', 'creator': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/137.0.0.0 Safari/537.36', 'moddate': '2025-06-30T11:23:18+00:00', 'page': 17.0, 'page_label': '18', 'producer': 'Skia/PDF m137', 'source': '/Users/yash/Library/mcqgen/app/Assistant/pdf_docs/ipl2025.pdf', 'title': '2025 Indian Premier League - Wikipedia', 'total_pages': 19.0}, page_content='Punjab Kings qualiﬁed for their second IPL ﬁnal after 2014.[100]\nPunjab Kings won the toss and elected to ﬁeld.\nThis match was originally scheduled for 25 May at the Eden Gardens, before being rescheduled.[86][18]\nRoyal Challengers Bengaluru won their maiden title after 18 years.[101]\nMost runs[102]\nRuns Player Team\n759 Sai Sudharsan Gujarat Titans\n717 Suryakumar Yadav Mumbai Indians\n657 Virat Kohli Royal Challengers Bengaluru\n650 Shubman Gill Gujarat Titans\n627 Mitchell Marsh Luc