<a href="https://colab.research.google.com/github/vikaspathak0911/GenAI/blob/main/PDFQuery_LangChain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
!pip install -q cassio datasets langchain langchain-google-genai tiktoken
!pip install -U langchain-community



In [2]:
#Langchain components to use
from langchain.vectorstores import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.embeddings import OpenAIEmbeddings

# Support fo dataset retrieval with Hugging Face
from datasets import load_dataset

#With CassIO, the engine powering the Astra DB integration in LangChain,
# We will also intitialize the DB connection:
import cassio

In [3]:
!pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [4]:
from PyPDF2 import PdfReader

In [5]:
import google.generativeai as genai
from google.colab import userdata

ASTRA_DB_APPLICATION_TOKEN = userdata.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = userdata.get("ASTRA_DB_ID")

GOOGLE_API_KEY=userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

In [6]:
llm = ChatGoogleGenerativeAI(model="gemini-2.5-pro", google_api_key=GOOGLE_API_KEY)

In [8]:
pdfreader = PdfReader('/content/Ikigai - The Japanese secret to a long and happy life.pdf')

In [9]:
from typing_extensions import Concatenate
# read text form pdf

raw_text = ''
for i, page in enumerate(pdfreader.pages):
  content = page.extract_text()
  if content:
    raw_text += content

In [10]:
raw_text

'PENGUIN\tBOOKS\nAn\timprint\tof\tPenguin\tRandom\tHouse\tLLC\n375\tHudson\tStreet\nNew\tYork,\tNew\tYork\t10014\npenguin.com\nCopyright\t©\t2016\tby\tHéctor\tGarcía\tand\tFrancesc\tMiralles\tTranslation\tcopyright\t©\t2017\tby\tPenguin\tRandom\tHouse\tLLC\nPenguin\tsupports\tcopyright.\tCopyright\tfuels\tcreativity,\tencourages\tdiverse\tvoices,\tpromotes\tfree\tspeech,\tand\tcreates\ta\tvibrant\tculture.\nThank\tyou\tfor\tbuying\tan\tauthorized\tedition\tof\tthis\tbook\tand\tfor\tcomplying\twith\tcopyright\tlaws\tby\tnot\treproducing,\tscanning,\tor\ndistributing\tany\tpart\tof\tit\tin\tany\tform\twithout\tpermission.\tYou\tare\tsupporting\twriters\tand\tallowing\tPenguin\tto\tcontinue\tto\tpublish\tbooks\nfor\tevery\treader.\nOriginally\tpublished\tin\tSpanish\tas\t\nIkigai:\tLos\tsecretos\tde\tJapón\tpara\tuna\tvida\tlarga\ty\tfeliz\n\tby\tEdiciones\tUrano,\tBarcelona.\nIllustration\t\nhere\n:\tAbbie/Shutterstock\tAll\tother\tillustrations\tcopyright\t©\t2016\tby\tMarisa\tMartínez\

In [11]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [12]:
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", google_api_key=GOOGLE_API_KEY)
embedding = GoogleGenerativeAIEmbeddings(model="models/embedding-001", google_api_key=GOOGLE_API_KEY)

In [13]:
astra_vector_Store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace = None,
)

In [14]:
from langchain.text_splitter import CharacterTextSplitter
# We need to split the text using Character Text Split such that it should not increase token size
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len,
)
texts = text_splitter.split_text(raw_text)

In [15]:
texts[:50]

['PENGUIN\tBOOKS\nAn\timprint\tof\tPenguin\tRandom\tHouse\tLLC\n375\tHudson\tStreet\nNew\tYork,\tNew\tYork\t10014\npenguin.com\nCopyright\t©\t2016\tby\tHéctor\tGarcía\tand\tFrancesc\tMiralles\tTranslation\tcopyright\t©\t2017\tby\tPenguin\tRandom\tHouse\tLLC\nPenguin\tsupports\tcopyright.\tCopyright\tfuels\tcreativity,\tencourages\tdiverse\tvoices,\tpromotes\tfree\tspeech,\tand\tcreates\ta\tvibrant\tculture.\nThank\tyou\tfor\tbuying\tan\tauthorized\tedition\tof\tthis\tbook\tand\tfor\tcomplying\twith\tcopyright\tlaws\tby\tnot\treproducing,\tscanning,\tor\ndistributing\tany\tpart\tof\tit\tin\tany\tform\twithout\tpermission.\tYou\tare\tsupporting\twriters\tand\tallowing\tPenguin\tto\tcontinue\tto\tpublish\tbooks\nfor\tevery\treader.\nOriginally\tpublished\tin\tSpanish\tas\t\nIkigai:\tLos\tsecretos\tde\tJapón\tpara\tuna\tvida\tlarga\ty\tfeliz\n\tby\tEdiciones\tUrano,\tBarcelona.\nIllustration\t\nhere',
 'for\tevery\treader.\nOriginally\tpublished\tin\tSpanish\tas\t\nIkigai:\tLos\tsecretos\t

In [16]:
astra_vector_Store.add_texts(texts[:50])

print("Inserted %i headlines." % len(texts[:50]))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_Store)

Inserted 50 headlines.



# Run the QA cycle

Run the cell and ask question -- or quit to stop

In [17]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()
    if query_text.lower() == 'quit':
        break
    if query_text == "":
        continue
    first_question = False
    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_index.vectorstore.similarity_search_with_score(query_text, k=4):
        print("   [%0.4f]\"%s ...\"" % (score, doc.page_content.replace('\n', ' ').replace('\r', '').strip()[:]))



Enter your question (or type 'quit' to exit): what is The 80 percent secret?

QUESTION: "what is The 80 percent secret?"




ANSWER: "The 80 percent secret refers to the Japanese saying "Hara hachi bu," which translates to "Fill your belly to 80 percent."  It's a piece of ancient wisdom advising against eating until completely full.  Okinawans, for example, follow this practice, stopping their meals when their stomachs reach 80 percent capacity."
FIRST DOCUMENTS BY RELEVANCE:




   [0.8781]"relations. Members	of	these	communities	manage	their	time	well	in	order	to	reduce stress,	consume	little	meat	or	processed	foods,	and	drink	alcohol	in	moderation. 1 They	don’t	do	strenuous	exercise,	but	they	do	move	every	day,	taking	walks and	working	in	their	vegetable	gardens.	People	in	the	Blue	Zones	would	rather walk	than	drive.	Gardening,	which	involves	daily	low-intensity	movement,	is	a practice	almost	all	of	them	have	in	common. The	80	percent	secret One	of	the	most	common	sayings	in	Japan	is	“Hara	hachi	bu,”	which	is	repeated before	or	after	eating	and	means	something	like	“Fill	your	belly	to	80	percent.” Ancient	wisdom	advises	against	eating	until	we	are	full.	This	is	why	Okinawans stop	eating	when	they	feel	their	stomachs	reach	80	percent	of	their	capacity, ..."
   [0.8781]"relations. Members	of	these	communities	manage	their	time	well	in	order	to	reduce stress,	consume	little	meat	or	processed	foods,	and	drink	alcohol	in	moderation. 1 They	don’t	do	strenuous	exer