In [1]:
import os

import cassio
from datasets import load_dataset
from dotenv import load_dotenv
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
from langchain.vectorstores.cassandra import Cassandra
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAI, OpenAIEmbeddings
from PyPDF2 import PdfReader
from typing_extensions import Concatenate

from secret.astra_token import token_json

load_dotenv("secret/.env")

  from .autonotebook import tqdm as notebook_tqdm
Python-dotenv could not parse statement starting at line 3


True

In [2]:
ASTRA_DB_APPLICATION_TOKEN = token_json["token"]
ASTRA_DB_ID = os.getenv("ASTRA_DB_ID")
OPEN_API_KEY = os.getenv("OPEN_API_KEY")

In [3]:
pdfreader = PdfReader("documents/budget_speech.pdf")

In [4]:
raw_text = ""
for i, page in enumerate(pdfreader.pages):
    content = page.extract_text()
    if content:
        raw_text += content

In [5]:
print(raw_text)

GOVERNMENT OF INDIA
BUDGET 2023-2024
SPEECH
OF
NIRMALA SITHARAMAN
MINISTER OF FINANCE
February 1,  2023CONTENTS 
PART-A 
 Page No.  
 Introduction 1 
 Achievements since 2014: Leaving no one behind 2 
 Vision for Amrit Kaal  – an empowered and inclusive economy 3 
 Priorities of this Budget 5 
i. Inclusive Development  
ii. Reaching the Last Mile 
iii. Infrastructure and Investment 
iv. Unleashing the Potential 
v. Green Growth 
vi. Youth Power  
vii. Financial Sector  
 
 
 
 
 
 
 
 
 Fiscal Management 24 
PART B  
  
Indirect Taxes  27 
 Green Mobility  
 Electronics   
 Electrical   
 Chemicals and Petrochemicals   
 Marine products  
 Lab Grown Diamonds  
 Precious Metals  
 Metals  
 Compounded Rubber  
 Cigarettes  
  
Direct Taxes  30 
 MSMEs and Professionals   
 Cooperation  
 Start-Ups  
 Appeals  
 Better targeting of tax concessions  
 Rationalisation  
 Others  
 Personal Income Tax  
  
Annexures  35 
 Annexure to Part B of the Budget Speech 2023-

In [6]:
cassio.init(token=ASTRA_DB_APPLICATION_TOKEN, database_id=ASTRA_DB_ID)

In [7]:
llm = OpenAI(openai_api_key=OPEN_API_KEY)
embedding = OpenAIEmbeddings(openai_api_key=OPEN_API_KEY)

In [8]:
astra_vector_store = Cassandra(
    embedding=embedding,
    table_name="qa_mini_demo",
    session=None,
    keyspace=None
)

In [10]:
text_spliiter = CharacterTextSplitter(
    separator="\n",
    chunk_size=800,
    chunk_overlap=200,
    length_function=len
)
texts = text_spliiter.split_text(raw_text)

In [12]:
texts[:5]

['GOVERNMENT OF INDIA\nBUDGET 2023-2024\nSPEECH\nOF\nNIRMALA SITHARAMAN\nMINISTER OF FINANCE\nFebruary 1,  2023CONTENTS \nPART-A \n Page No.  \n\uf0b7 Introduction 1 \n\uf0b7 Achievements since 2014: Leaving no one behind 2 \n\uf0b7 Vision for Amrit Kaal  – an empowered and inclusive economy 3 \n\uf0b7 Priorities of this Budget 5 \ni. Inclusive Development  \nii. Reaching the Last Mile \niii. Infrastructure and Investment \niv. Unleashing the Potential \nv. Green Growth \nvi. Youth Power  \nvii. Financial Sector  \n \n \n \n \n \n \n \n \n\uf0b7 Fiscal Management 24 \nPART B  \n  \nIndirect Taxes  27 \n\uf0b7 Green Mobility  \n\uf0b7 Electronics   \n\uf0b7 Electrical   \n\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 Lab Grown Diamonds  \n\uf0b7 Precious Metals  \n\uf0b7 Metals  \n\uf0b7 Compounded Rubber  \n\uf0b7 Cigarettes  \n  \nDirect Taxes  30 \n\uf0b7 MSMEs and Professionals',
 '\uf0b7 Chemicals and Petrochemicals   \n\uf0b7 Marine products  \n\uf0b7 La

In [13]:
astra_vector_store.add_texts(texts[:50])
print(f"Interested {len(texts[:50])} headlines.")

astra_vector_index = VectorStoreIndexWrapper(vectorstore=astra_vector_store)

Interested 50 headlines.


In [16]:
first_question = True
while True:
    if first_question:
        query_text = input("\nEnter your question (or type 'quit' to exit): ").strip()
    else:
        query_text = input("\nWhat's your next question (or type 'quit' to exit): ").strip()

    if query_text.lower() == "quit":
        break

    if query_text == "":
        continue

    first_question = False

    print(f"\nQUESTION: {query_text}")
    answer = astra_vector_index.query(query_text, llm=llm).strip()
    print(f"\nANSWER: {answer}")

    print(f"\FIRST DOCUMENTS BY RELEVANCE: {query_text}")
    for doc, score in astra_vector_store.similarity_search_with_score(query_text, k=4):
        print(f"[{round(score, 4)}] {doc.page_content[:84]} ...")



QUESTION: What is the current GDP ?

ANSWER: 7 per cent.
\FIRST DOCUMENTS BY RELEVANCE: What is the current GDP ?
[0.8992] estimated to be at 7 per cent. It is notable that this is the highest among all 
the ...
[0.898] With the theme of ‘ Vasudhaiva Kutumbakam’ , we are steering an 
ambitious, people-c ...
[0.8962] multiplier impact on growth and employment. After the subdued period of 
the pandemi ...
[0.8949] February 1, 2023 
Hon’ble Speaker,  
 I present the Budget for 2023-24. This is the  ...
