In [18]:
# Import necessary modules and classes
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM 
from transformers import pipeline
import torch
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


In [19]:
import PyPDF2
import textwrap

# Read the text of EVERY page of the PDF. Extracting only reader.pages[0]
# leaves the rest of the document out of the index, so questions about
# anything beyond the first page can never be answered from the context.
with open("./ipc-data.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    # `or ""` guards against pages for which no text comes back.
    # The join consumes the generator while the file is still open.
    document = "\n".join(page.extract_text() or "" for page in reader.pages)

# Split the text into chunks of at most `chunk_size` characters.
# textwrap.wrap breaks on whitespace and, with its default settings,
# replaces newlines with spaces, so page boundaries do not survive chunking.
chunk_size = 500
texts = textwrap.wrap(document, chunk_size)

In [20]:
class Document:
    """Lightweight record pairing a chunk of text with optional metadata.

    Attributes:
        page_content: The text of the chunk, stored exactly as passed in.
        metadata: Whatever was passed as ``metadata``; ``None`` when omitted.
    """

    def __init__(self, page_content, metadata=None):
        # Both values are stored as given; no copying or validation is done.
        self.page_content, self.metadata = page_content, metadata

# Wrap every text chunk in a Document (metadata left at its default)
documents = list(map(Document, texts))

# Embedding model used to vectorise the documents
embeddings = SentenceTransformerEmbeddings(
    model_name="multi-qa-mpnet-base-dot-v1",
)

# Build the Chroma vector store from the documents, using
# ./persist_directory/ as its persist directory
db = Chroma.from_documents(
    documents,
    embeddings,
    persist_directory='./persist_directory/',
)

In [21]:
# Hugging Face checkpoint name of the seq2seq model used for generation
checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

# Load the tokenizer and the model for that checkpoint (weights as float32)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype=torch.float32)

# Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
base_model = base_model.to(device)

In [22]:
# Create a text2text-generation pipeline around the already-loaded model.
# The pipeline is told explicitly which device the model lives on: without
# `device`, some transformers versions default the pipeline to the CPU and
# leave the tokenized inputs there, which fails with a device-mismatch error
# once the model has been moved to the GPU.
pipe = pipeline(
    'text2text-generation',
    model = base_model,
    tokenizer = tokenizer,
    device = base_model.device,
    max_length = 512,
    # Sampling is enabled, so answers can differ between runs unless a seed is set.
    do_sample = True,
    temperature = 0.3,
    top_p= 0.95
)


In [23]:
# Wrap the transformers pipeline so it can be used as a LangChain LLM
local_llm = HuggingFacePipeline(pipeline=pipe)

# Retriever over the vector store: similarity search with k = 2
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

# Build the RetrievalQA chain (chain type 'stuff'); source documents are
# returned alongside the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
)


In [24]:
# Ask the user for a question (input() already returns a str)
input_query = input("Enter your query:")

# Run the question through the QA chain
llm_response = qa_chain({"query": input_query})

# Show only the generated answer from the chain's response
print(llm_response['result'])


The provided context does not provide any information about terrorist acts. Therefore, the answer is "unknown".
