In [13]:
# prompt: In the PDF there are question-and-answer pages from page 9 to the end, with each question in pink colour and its answer following in black colour. Can you write code to extract them into a CSV file with separate question and answer columns? Can't we do that using whatever information is available, e.g. a question line starts with a number followed by a dot and ends with a question mark?

import PyPDF2
import re
import pandas as pd

def extract_qa_from_pdf(pdf_path, start_page=8, end_page=40):
  """Extracts numbered questions and their answers from a range of PDF pages.

  A line counts as a question when, after trimming surrounding whitespace, it
  starts with a number followed by a dot and ends with a question mark
  (e.g. "12. Is aeration necessary?"). Every following line, up to the next
  question, is collected as that question's answer, so an answer may continue
  across a page break. Lines that appear before the first question are ignored.

  Args:
    pdf_path: The path to the PDF file.
    start_page: Zero-based index of the first page to scan (8 means page 9).
    end_page: Zero-based index one past the last page to scan, or None to scan
      through the last page. Values beyond the document length are clamped, so
      a PDF shorter than the requested range does not raise IndexError.

  Returns:
    A list of dictionaries, where each dictionary contains a 'question' (the
    text between the leading "N." and the trailing "?") and an 'answer' (the
    collected lines joined by newlines, with outer whitespace stripped).
  """
  question_pattern = re.compile(r"^\d+\.\s*(.*?)\?$")
  qa_pairs = []
  current_question = None
  answer_lines = []

  with open(pdf_path, 'rb') as pdf_file:
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    num_pages = len(pdf_reader.pages)

    # Clamp the requested window to the pages that actually exist.
    first_page = max(start_page, 0)
    last_page = num_pages if end_page is None else min(end_page, num_pages)

    for page_num in range(first_page, last_page):
      # Treat a page without extractable text as empty instead of failing.
      page_text = pdf_reader.pages[page_num].extract_text() or ""

      for line in page_text.split('\n'):
        # Extracted lines often carry stray leading/trailing spaces that would
        # otherwise defeat the ^...$ anchors, so match on the trimmed line.
        match = question_pattern.match(line.strip())
        if match:
          if current_question:
            qa_pairs.append({
                "question": current_question,
                "answer": "\n".join(answer_lines).strip(),
            })
          current_question = match.group(1)
          answer_lines = []
        elif current_question:
          answer_lines.append(line)

  # Flush the last question, which has no following question to trigger it.
  if current_question:
    qa_pairs.append({
        "question": current_question,
        "answer": "\n".join(answer_lines).strip(),
    })

  return qa_pairs

# Source PDF and destination CSV both live in the project folder on Google
# Drive. Writing the CSV there (instead of the current working directory)
# means the cells that read '/content/drive/MyDrive/aquabot/qa_from_pdf.csv'
# pick up this freshly generated file rather than a stale copy.
pdf_file_path = '/content/drive/MyDrive/aquabot/FAQ English book.pdf'
qa_csv_path = '/content/drive/MyDrive/aquabot/qa_from_pdf.csv'
qa_data = extract_qa_from_pdf(pdf_file_path)

# Create a Pandas DataFrame from the extracted data. The explicit column list
# keeps the header present even when no question/answer pair was found.
df = pd.DataFrame(qa_data, columns=['question', 'answer'])

# Save the DataFrame to a CSV file (no index column, so it reloads cleanly)
df.to_csv(qa_csv_path, index=False)

print("Questions and answers extracted and saved to qa_from_pdf.csv")

Questions and answers extracted and saved to qa_from_pdf.csv


In [3]:
# NOTE(review): `!cd` runs in a temporary subshell, so this line does not change
# the notebook's working directory; `%cd drive` would be needed for a lasting
# change. The other cells use absolute paths, so nothing appears to rely on it.
!cd drive

In [15]:
import pandas as pd
# Reload the extracted question/answer table from the project folder on Drive.
qa_csv_on_drive = '/content/drive/MyDrive/aquabot/qa_from_pdf.csv'
df = pd.read_csv(qa_csv_on_drive)

In [17]:
# Peek at the first five rows to sanity-check the loaded table.
df.head(n=5)

Unnamed: 0,question,answer
0,What measures need to be taken to disinfect th...,"In case of completely non-drainable ponds, the..."
1,Does bleaching affect the efficiency of minera...,No. Disinfection of pond water with bleaching ...
2,Does the ground water also require disinfection,Disinfection is carried out to kill unwanted o...
3,Do we need to apply lime after disinfecting th...,Liming of shrimp ponds is done to neutralize s...
4,Is aeration before stocking is necessary,Farmers apply fermented organic juice for the ...


In [53]:
# prompt: need langchain code to have a model from HuggingFace to answer the csv questions along with HuggingFace token

!pip install langchain huggingface_hub transformers langchain-community sentence-transformers faiss-cpu


from langchain.document_loaders import CSVLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub

import os
from getpass import getpass

# Load CSV data (each CSV row becomes one document)
loader = CSVLoader(file_path='/content/drive/MyDrive/aquabot/qa_from_pdf.csv')
data = loader.load()

# Use lighter, faster embeddings.
# On a GPU runtime, model_kwargs={'device': 'cuda'} can be passed as well.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Create FAISS index
db = FAISS.from_documents(data, embeddings)

# Resolve the Hugging Face token without hardcoding it in the notebook:
# keep a value defined in an earlier cell if there is one, otherwise read the
# HUGGINGFACEHUB_API_TOKEN environment variable, otherwise prompt for it.
# Previously `access_token` was never defined, so a fresh kernel hit NameError.
try:
    access_token
except NameError:
    access_token = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass(
        "Hugging Face API token: "
    )

# Use smaller model
llm = HuggingFaceHub(
    repo_id="google/flan-t5-base",  # swap in another flan-t5 size to trade speed for quality
    model_kwargs={
        "temperature": 0.1,
        "max_length": 512,
        "truncation": True
    },
    huggingfacehub_api_token=access_token
)









In [54]:
# Retriever: fetch the 10 nearest indexed documents for every query.
retriever_search_options = dict(k=10)
retriever = db.as_retriever(search_kwargs=retriever_search_options)

# QA chain: the "stuff" chain type with this retriever; source documents are
# not included in the chain output.
qa_chain_options = dict(
    llm=llm,
    chain_type="stuff",
    retriever=retriever,
    return_source_documents=False,
)
qa = RetrievalQA.from_chain_type(**qa_chain_options)

In [59]:
# Query
query = "what is the creature cultured here"
# Chain.run() is deprecated in current LangChain releases; invoke() takes the
# inputs as a dict and returns a dict whose "result" entry is the answer text.
result = qa.invoke({"query": query})["result"]

In [60]:
# Show the answer returned by the QA chain for the query above.
print(result)

Shrimp farmers should procure seeds only from the registered hatcheries following due quality screening procedures.
