In [35]:
# Install necessary libraries
!pip install langchain openai tiktoken rapidocr-onnxruntime PyMuPDF faiss-cpu

from google.colab import userdata
import os



In [36]:
# Fetch the OpenAI API key from the Colab secret store and export it through
# the environment for libraries that read OPENAI_API_KEY.
OPENAI_API_KEY = userdata.get('OPENAI_API_KEY')
if not OPENAI_API_KEY:
    # Fail here with an actionable message: assigning None to os.environ
    # raises a TypeError, and an empty key would only surface later as an
    # authentication error on the first API call.
    raise ValueError(
        "Colab secret 'OPENAI_API_KEY' is missing or empty; add it in the "
        "Secrets panel and grant this notebook access to it."
    )
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY


In [37]:
# Data Ingestion
import fitz  # PyMuPDF
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [38]:
# Function to extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``fitz.open``.

    Returns:
        One string holding each page's ``get_text()`` output, joined without
        any separator. An empty document yields ``""``.
    """
    # The context manager closes the document even when a page fails to
    # parse; previously the handle was never closed.
    with fitz.open(pdf_path) as doc:
        # join() builds the result once instead of re-allocating the string
        # on every `+=` in a loop.
        return "".join(page.get_text() for page in doc)

In [39]:
# Location of the PDF to ingest; read by the extraction call in the next cell.
# NOTE(review): "/content" looks like the Colab working directory — upload the
# file there or change this path to match your environment.
pdf_path = "/content/data.pdf"

In [40]:
# Read the whole PDF into a single string via extract_text_from_pdf; this
# string is what the text splitter receives in the next step.
rawdata = extract_text_from_pdf(pdf_path)

In [41]:
# Cut the raw text into chunks (chunk_size=500) that overlap by 50, so that
# content falling on a chunk boundary also appears in the neighbouring chunk.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=50,
    chunk_size=500,
)
text_chunks = text_splitter.split_text(text=rawdata)

In [42]:
# Wrap each text chunk in a LangChain Document, the input type handed to
# FAISS.from_documents further down.
from langchain.docstore.document import Document

documents = [Document(page_content=chunk) for chunk in text_chunks]

# Printing the full list would dump every chunk of the PDF into the output;
# a count plus a short preview is enough to sanity-check the split.
print(f"{len(documents)} document chunks created")
documents[:3]



In [43]:
pip install -U langchain-community



In [44]:
# Import the FAISS vector store and the chat prompt template
from langchain.vectorstores import FAISS
from langchain.prompts import ChatPromptTemplate

In [45]:
# Embed every chunk with OpenAI embeddings and index the vectors in FAISS.
from langchain.embeddings import OpenAIEmbeddings

if not documents:
    # A PDF without a text layer (e.g. scanned pages) yields no chunks.
    # Stop here with a clear message rather than sending an empty list to the
    # embedding and indexing calls, which is expected to fail obscurely.
    raise ValueError(
        "No text chunks to index: the PDF produced no extractable text."
    )

embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

vectorstore = FAISS.from_documents(documents, embeddings)
# Retriever with default search settings; it supplies the context in the
# RAG chain built later.
retriever = vectorstore.as_retriever()

In [46]:
# Prompt for the answering model. It has two placeholders: {question} (the
# user's query) and {context} (the retrieved text). Each literal below is one
# line of the template.
template = "".join(
    [
        "You are an assistant for question-answering tasks.\n",
        "Use the following pieces of retrieved context to answer the question.\n",
        "If you don't know the answer, just say that you don't know.\n",
        "Use ten sentences maximum and keep the answer concise.\n",
        "Question: {question}\n",
        "\n",
        "Context: {context}\n",
        "\n",
        "Answer:\n",
    ]
)

prompt = ChatPromptTemplate.from_template(template)

In [47]:
# Set up the chat model that turns the filled-in prompt into an answer.
from langchain.chat_models import ChatOpenAI

llm_model = ChatOpenAI(
    model_name="gpt-3.5-turbo",
    openai_api_key=OPENAI_API_KEY,
)

In [48]:
# Output parser: the last stage of the RAG chain. The chain's results shown
# further down are plain strings.
from langchain.schema.output_parser import StrOutputParser
output_parser = StrOutputParser()

In [49]:
# Create RAG chain
from langchain.schema.runnable import RunnablePassthrough


def _format_docs(docs):
    """Join the retrieved documents' text into one plain-text context block."""
    return "\n\n".join(doc.page_content for doc in docs)


# The incoming question goes to the retriever (to fetch context) and is also
# passed through unchanged as {question}. The retrieved documents are
# flattened to plain text first; otherwise the prompt would contain the
# Python repr of a list of Document objects instead of their content.
rag_chain = (
    {"context": retriever | _format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm_model
    | output_parser
)

In [51]:
# Ask the chain a sample question. The question string is the chain's only
# input; the returned answer is printed as plain text.
print(rag_chain.invoke("What is the Cusip in the document?"))

The CUSIP in the document is 40434YQJ9.


In [55]:
# Ask for the issuer. As the last expression in the cell, the returned string
# is displayed as the cell's output.
rag_chain.invoke("Who is the Issuer of this document?")

'The Issuer of this document is HSBC Bank USA, National Association.'

In [56]:
# Same issuer question, phrased differently, to check that the answer does
# not depend on the exact wording.
rag_chain.invoke("Can you tell me the name of the Issuer mentioned in this document?")

'The Issuer mentioned in the document is HSBC Bank USA, National Association.'