# Quickstart: Querying a PDF With Astra DB and LangChain

### A question-answering demo using Astra DB and LangChain, powered by Vector Search

In [1]:
! pip install -q cassio datasets langchain openai tiktoken

In [2]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file into the process environment so the
# os.getenv() lookups further down can find the credentials.
load_dotenv()

True

In [4]:
! pip install PyPDF2



In [5]:
from PyPDF2 import PdfReader

In [6]:
# Credentials come from the process environment (e.g. loaded from a .env file
# by load_dotenv()); each value is None when its variable is not set.
ASTRA_DB_APPLICATION_TOKEN = os.environ.get("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.environ.get("ASTRA_DB_ID")
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [7]:
# Path of the PDF file to query; replace 'aa.pdf' with your own document.
pdfreader = PdfReader('aa.pdf')

In [8]:
# Extract the text of every page and concatenate it into a single string.
# Pages for which extract_text() returns nothing (None or "") are skipped.
page_texts = (page.extract_text() for page in pdfreader.pages)
raw_text = "".join(text for text in page_texts if text)

In [9]:
# Display the text extracted from the PDF.
raw_text

'A P J Abdul Kalam Departing speech Friends, I am delighted to \naddress you all, in the country and those living abroad, after \nworking with you and completing five beautiful and eventful \nyears in Rashtrapati Bhavan. Today, it is indeed a thanks giving \noccasion. I would like to narrate, how I enjoyed every minute of \nmy tenure enriched by the wonderful association from each \none of you, hailing from different walks of life, be it politics, \nscience and technology, academics, arts, literature, business, \njudiciary, administration, local bodies, farming, home makers, \nspecial children, media and above all from the youth and \nstudent community who are the future wealth of our country. \nDuring my interaction at Rashtrapati Bhavan in Delhi and at \nevery state and union territory as well as through my online \ninteractions, I have many unique experiences to share with \nyou, which signify the following important messages: 1. \nAccelerate development : Aspiration of the youth, 2

Initialize the connection to your database:

_(do not worry if you see a few warnings, it's just that the drivers are chatty about negotiating protocol versions with the DB.)_

In [10]:
# Initialize the Astra DB connection through CassIO, using the token and
# database ID read from the environment.
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [11]:
# `llm` is passed to the index query in the QA loop; `embedding` is handed to
# the vector store. Both are configured with the OpenAI key from the environment.
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)
  embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


With the LangChain embedding and LLM objects created above for later usage, now create the vector store backed by Astra DB:

In [12]:
# Vector store kept in the "qa_mini_demo" table; it receives the `embedding`
# object created earlier.
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,  # presumably falls back to the connection set up by cassio.init() — confirm against the LangChain docs
    keyspace=None,  # presumably likewise resolved from the cassio.init() defaults — confirm
)

In [13]:
from langchain.text_splitter import CharacterTextSplitter
# Split the raw text into overlapping chunks with CharacterTextSplitter so that
# each piece stays small and does not inflate the token count of later requests.
text_splitter = CharacterTextSplitter(
    separator = "\n",  # split on line breaks
    chunk_size = 800,  # target maximum chunk length, as measured by length_function
    chunk_overlap  = 200,  # overlap between consecutive chunks, in the same unit
    length_function = len,  # len() on a str counts characters
)
texts = text_splitter.split_text(raw_text)

In [14]:
# Preview the first two chunks produced by the splitter.
texts[:2]

['A P J Abdul Kalam Departing speech Friends, I am delighted to \naddress you all, in the country and those living abroad, after \nworking with you and completing five beautiful and eventful \nyears in Rashtrapati Bhavan. Today, it is indeed a thanks giving \noccasion. I would like to narrate, how I enjoyed every minute of \nmy tenure enriched by the wonderful association from each \none of you, hailing from different walks of life, be it politics, \nscience and technology, academics, arts, literature, business, \njudiciary, administration, local bodies, farming, home makers, \nspecial children, media and above all from the youth and \nstudent community who are the future wealth of our country. \nDuring my interaction at Rashtrapati Bhavan in Delhi and at',
 'special children, media and above all from the youth and \nstudent community who are the future wealth of our country. \nDuring my interaction at Rashtrapati Bhavan in Delhi and at \nevery state and union territory as well as thro

### Load the dataset into the vector store

In [15]:

# Insert at most the first 50 chunks into the vector store. The slice is taken
# once so the reported count always matches what was actually inserted.
chunks_to_insert = texts[:50]
astra_vector_store.add_texts(chunks_to_insert)

# The inserted items are chunks of the PDF text, so report them as such.
print("Inserted %i text chunks." % len(chunks_to_insert))

# Wrap the store so it can be queried together with an LLM in the QA loop.
astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 3 headlines.


### Run the QA cycle

Simply run the cells and ask a question, or type `quit` to stop. (You can also stop execution with the "▪" button on the top toolbar.)

In [None]:
# Interactive QA loop: read a question, print the LLM's answer obtained through
# the vector index, then list the four top-scoring stored chunks.
# Typing "quit" (any letter case) ends the loop; empty input is ignored.
first_question = True
while True:
    # The prompt wording changes once the first question has been asked.
    prompt = (
        "\nEnter your question (or type 'quit' to exit): "
        if first_question
        else "\nWhat's your next question (or type 'quit' to exit): "
    )
    query_text = input(prompt).strip()

    if query_text.lower() == "quit":
        break
    if not query_text:
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    scored_docs = astra_vector_store.similarity_search_with_score(query_text, k=4)
    for doc, score in scored_docs:
        # Only the first 84 characters of each chunk are shown.
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))