# In [5]:
import PyPDF2
import textwrap
from langchain.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
import torch
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import SentenceTransformerEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pickle

# Define the Document class
class Document:
    """Minimal text container exposing ``page_content`` and ``metadata``.

    Args:
        page_content: The text held by this document/chunk.
        metadata: Optional mapping with extra information about the text.
            When omitted, a fresh empty dict is stored so ``metadata`` is
            always a mapping (never ``None``) and is not shared between
            instances.
    """

    def __init__(self, page_content, metadata=None):
        self.page_content = page_content
        # Build the default per instance: a literal ``{}`` default argument
        # would be one dict shared (and mutated) across all instances.
        self.metadata = {} if metadata is None else metadata

    def __repr__(self):
        return (
            f"{type(self).__name__}(page_content={self.page_content!r}, "
            f"metadata={self.metadata!r})"
        )

# Read the text of every page of the PDF (previously only page 0 was read,
# so the rest of the document never reached the vector store).
with open("./ipc-data.pdf", "rb") as file:
    reader = PyPDF2.PdfReader(file)
    # Extraction must happen while the file is still open. `or ""` guards
    # against a page that yields no text (e.g. an image-only page).
    page_texts = [page.extract_text() or "" for page in reader.pages]

document = "\n".join(page_texts)

# Split the text into chunks of at most `chunk_size` characters.
# textwrap.wrap breaks on whitespace and replaces newlines/tabs with spaces.
chunk_size = 500
texts = textwrap.wrap(document, chunk_size)

# Fail early with a clear message instead of building an empty index.
if not texts:
    raise ValueError("No extractable text found in ./ipc-data.pdf")

# Wrap each chunk in a Document object for the vector store
documents = [Document(text) for text in texts]

# Sentence-embedding model used to vectorize the document chunks
embeddings = SentenceTransformerEmbeddings(model_name="multi-qa-mpnet-base-dot-v1")

# Embed the chunks and store them in a Chroma vector database that uses
# ./persist_directory/ as its persistence directory
db = Chroma.from_documents(
    documents=documents,
    embedding=embeddings,
    persist_directory='./persist_directory/',
)

# Hugging Face checkpoint used as the answer-generating language model
checkpoint = "MBZUAI/LaMini-Flan-T5-783M"

# Load the tokenizer and the seq2seq model weights in float32
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
base_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, torch_dtype=torch.float32)

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
base_model = base_model.to(device)

# Build the text generation pipeline. `device` is passed explicitly so the
# pipeline places its input tensors on the same device as the model; when it
# is omitted, some transformers releases default the pipeline to the CPU even
# though the model was moved to CUDA above.
pipe = pipeline(
    'text2text-generation',
    model=base_model,
    tokenizer=tokenizer,
    device=device,
    max_length=512,
    do_sample=True,
    temperature=0.3,
    top_p=0.95,
)

# Wrap the pipeline so it can be used as a LangChain LLM
local_llm = HuggingFacePipeline(pipeline=pipe)

# Similarity-search retriever over the vector database, limited to the
# 2 closest chunks per query
retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 2})

# RetrievalQA chain of type 'stuff' combining the retriever with the local
# LLM; the retrieved source documents are returned alongside the answer
qa_chain = RetrievalQA.from_chain_type(
    llm=local_llm,
    chain_type='stuff',
    retriever=retriever,
    return_source_documents=True,
)

# Read the question from standard input (input() already returns a str)
input_query = input("Enter your query: ")

try:
    # Run the QA chain on the question, then show only the generated answer
    llm_response = qa_chain({"query": input_query})
    print(llm_response["result"])
except Exception as exc:
    # Report any failure from the chain instead of letting it propagate
    print(f"An error occurred: {exc}")

# Serialize the model object to trained_model.pkl with pickle.
# NOTE(review): nothing in the visible script trains or fine-tunes the model,
# so this file holds the checkpoint as loaded. Hugging Face models are
# conventionally saved with save_pretrained() - confirm pickle is intended.
with open("trained_model.pkl", "wb") as model_file:
    pickle.dump(base_model, model_file)


# Captured cell output:
#   warn_deprecated(
#
#
# The context does not provide information about the labor law.
