In [None]:
! pip install -q cassio datasets langchain openai tiktoken


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
! pip install PyPDF2

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
Installing collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:
# LangChain components to use
from langchain.vectorstores.cassandra import Cassandra
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.llms import OpenAI
from langchain.embeddings import OpenAIEmbeddings
from PyPDF2 import PdfReader

# Support for dataset retrieval with Hugging Face
from datasets import load_dataset

# With CassIO, the engine powering the Astra DB integration in LangChain,
# you will also initialize the DB connection:
import cassio

import os
from dotenv import load_dotenv
load_dotenv()

True

In [7]:
# configurations
ASTRA_DB_APPLICATION_TOKEN =os.getenv("ASTRA_DB_APPLICATION_TOKEN")
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [8]:
# provide the path of  pdf file/files.
pdfreader = PdfReader('course-resources/2020_budget_speech.pdf')

In [9]:
from typing_extensions import Concatenate

raw_text = ''
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

raw_text

'Better finances,\nbetter lives\nThe Honourable Paul Martin, P .C., M.P .\nMinister of Finance\nFebruary 28, 2000\nDepartment of Finance\nCanadaMinistère des Finances\nCanadaspeeche•good  2/27/00  11:08 AM  Page 1© Her Majesty the Queen in Right of Canada (2000)\nAll rights reserved\nAll requests for permission to reproduce these documents\nor any part thereof shall be addressed to Public Works\nand Government Services Canada.\nAvailable from the \nDepartment of Finance Canada Distribution Centre \n300 Laurier Avenue West, P1 West Tower\nOttawa, Canada  K1A 0G5\nTel: (613) 995-2855\nFax: (613) 996-0518\nand from participating bookstores.\nAlso on the Internet at:\nhttp://www.fin.gc.ca/ \nCette publication est également disponible en français.\nCat. No.: F1-23/2000-2E \nISBN 0-662-28617-0speeche•good  2/27/00  11:08 AM  Page 2Introduction\nMr. Speaker, before I begin, I want to express the Government’s\nappreciation to the Standing Committee on Finance and the manycommittees of caucus f

In [10]:
# initialize the connect to the database
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [None]:
# create langchain embeddings and llm objects and vector store
llm = OpenAI(openai_api_key=OPENAI_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None,
)

  llm = OpenAI(openai_api_key=OPENAI_API_KEY)
  embedding = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


In [12]:
from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator = "\n",
    chunk_size = 800,
    chunk_overlap  = 200,
    length_function = len,
)
texts = text_splitter.split_text(raw_text)

texts[:50]

['Better finances,\nbetter lives\nThe Honourable Paul Martin, P .C., M.P .\nMinister of Finance\nFebruary 28, 2000\nDepartment of Finance\nCanadaMinistère des Finances\nCanadaspeeche•good  2/27/00  11:08 AM  Page 1© Her Majesty the Queen in Right of Canada (2000)\nAll rights reserved\nAll requests for permission to reproduce these documents\nor any part thereof shall be addressed to Public Works\nand Government Services Canada.\nAvailable from the \nDepartment of Finance Canada Distribution Centre \n300 Laurier Avenue West, P1 West Tower\nOttawa, Canada  K1A 0G5\nTel: (613) 995-2855\nFax: (613) 996-0518\nand from participating bookstores.\nAlso on the Internet at:\nhttp://www.fin.gc.ca/ \nCette publication est également disponible en français.\nCat. No.: F1-23/2000-2E',
 'Fax: (613) 996-0518\nand from participating bookstores.\nAlso on the Internet at:\nhttp://www.fin.gc.ca/ \nCette publication est également disponible en français.\nCat. No.: F1-23/2000-2E \nISBN 0-662-28617-0speeche•g

In [13]:
# insert the data into vector store - only top 50 chunks

astra_vector_store.add_texts(texts[:50])
print("Inserted %i headlines." % len(texts[:50]))

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Inserted 50 headlines.


In [15]:
# run Q&A cycle

first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print("\nQUESTION: \"%s\"" % query_text)
    
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print("ANSWER: \"%s\"\n" % answer)

    print("FIRST DOCUMENTS BY RELEVANCE:")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print("    [%0.4f] \"%s ...\"" % (score, doc.page_content[:84]))


QUESTION: "How many new jobs were created in 1999?"
ANSWER: "More than 425,000."

FIRST DOCUMENTS BY RELEVANCE:
    [0.9018] "positive economic indicators are now beginning to be reflected in thedaily lives of  ..."
    [0.8982] "The size of our economy will surpass the trillion-dollar mark
this year.
The deficit ..."
    [0.8881] "This is clearly good news.However, Mr. Speaker, while the progress of recent years i ..."
    [0.8823] "9speeche•good  2/27/00  11:08 AM  Page 9If we are to capitalize on the opportunities ..."

QUESTION: "Give top 5 gist of the speech."
ANSWER: "1. The importance of fostering a culture of innovation in order to succeed as a nation.
2. The need to inspire a spirit of entrepreneurship and see Canada as a place to live.
3. The recognition that both the private and public sectors have a role to play in capitalizing on opportunities in the new economy.
4. The emphasis on values such as caring, compassion, and equitable sharing of economic growth.
5. The focus o