In [None]:
# Install PyPDF2 into the kernel's environment, pinned to the version this
# notebook was last run with so a fresh run resolves the same release.
%pip install -q PyPDF2==3.0.1

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m246.9 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1


In [None]:
# poppler-utils provides the PDF rendering binaries used by pdf2image.
# -y answers the confirmation prompt (the notebook has no interactive stdin);
# -qq keeps the cell output short.
!apt-get install -y -qq poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 49 not upgraded.
Need to get 186 kB of archives.
After this operation, 696 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Fetched 186 kB in 1s (194 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 123629 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.5_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.5) ...
Setting up poppler-utils (22.02.0-2ubuntu0.5) ...
Processing triggers for man-db (2.10.2-1) ...


In [None]:
# Install pdf2image into the kernel's environment, pinned to the version this
# notebook was last run with.
%pip install -q pdf2image==1.17.0

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [None]:
# Python wrapper for Tesseract, pinned to the version this notebook was last run with.
%pip install -q pytesseract==0.3.13
# The Tesseract OCR engine itself. `apt-get` is used instead of `apt` (whose CLI
# is not meant for scripts), and -y answers the confirmation prompt that appears
# because additional language-data packages are pulled in.
!sudo apt-get install -y -qq tesseract-ocr

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 4,816 kB of archives.
After this operation, 15.6 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 

In [8]:
import os
from typing import List
from transformers import pipeline
import PyPDF2

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text layer of a PDF file using PyPDF2.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The text of every page concatenated in page order. An empty string is
        returned (and the error is printed, not raised) when the file cannot
        be opened or PyPDF2 fails to parse it.
    """
    try:
        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # `or ''` keeps the join safe if a page yields a falsy value
            # (e.g. None) instead of a string.
            return ''.join(page.extract_text() or '' for page in pdf_reader.pages)
    # PdfReadError lives in `PyPDF2.errors`. `PyPDF2.utils` does not exist in
    # PyPDF2 3.x, so naming it here would raise AttributeError while the
    # original error was being handled. OSError also covers FileNotFoundError,
    # PermissionError and IsADirectoryError raised by `open`.
    except (OSError, PyPDF2.errors.PdfReadError) as e:
        print(f"Error extracting text from PDF: {e}")
        return ""

# Hugging Face checkpoint used for extractive question answering.
_QA_MODEL_NAME = 'distilbert-base-uncased-distilled-squad'
# Lazily-created pipeline shared by all calls, so the model is loaded once
# instead of once per question.
_qa_pipeline = None


def _get_qa_pipeline():
    """Return the shared question-answering pipeline, creating it on first use."""
    global _qa_pipeline
    if _qa_pipeline is None:
        _qa_pipeline = pipeline('question-answering', model=_QA_MODEL_NAME)
    return _qa_pipeline


def answer_questions(pdf_text: str, questions: List[str]) -> List[str]:
    """Answer questions about the PDF text with an extractive QA model.

    Args:
        pdf_text: Text extracted from the PDF; used as the model's context.
        questions: Questions to answer against ``pdf_text``.

    Returns:
        One answer string per question, in the same order. When ``pdf_text``
        is empty or only whitespace, every answer is a fixed apology message
        and the model is never loaded.
    """
    if not pdf_text or not pdf_text.strip():
        # Nothing to search in: skip loading the model altogether.
        return ["Sorry, I couldn't extract any text from the provided PDF."
                for _ in questions]
    if not questions:
        return []

    qa_model = _get_qa_pipeline()
    return [
        qa_model(question=question, context=pdf_text)['answer']
        for question in questions
    ]

def run_qa_session(pdf_path: str):
    """Run an interactive question-answering loop over the text of a PDF.

    The PDF text is extracted once, then questions are read from standard
    input and answered one at a time until the user types 'quit' (any case,
    surrounding whitespace ignored) or input ends.

    Args:
        pdf_path: Path to the PDF file to query.

    Returns:
        None. All results and errors are printed.
    """
    # isfile (not exists) so a directory path is reported instead of crashing
    # later when it is opened as a file.
    if not os.path.isfile(pdf_path):
        print(f"Error: File not found at {pdf_path}")
        return

    pdf_text = extract_text_from_pdf(pdf_path)
    if not pdf_text or not pdf_text.strip():
        # Without text every answer would be the same apology; say it once.
        print("Sorry, I couldn't extract any text from the provided PDF.")
        return

    while True:
        try:
            question = input("\nPlease ask your question (or type 'quit' to exit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the user interrupted: leave cleanly.
            print("\nThank you for using the QA system. Goodbye!")
            break
        if not question:
            # A blank question gives the model nothing to work with; re-prompt.
            continue
        if question.lower() == 'quit':
            print("Thank you for using the QA system. Goodbye!")
            break
        answers = answer_questions(pdf_text, [question])
        print(f"Answer: {answers[0]}")

# Example usage: start an interactive session against an uploaded PDF.
# Replace the location below with the correct file path for your runtime.
pdf_path = (
    '/content/'
    'Wood is Good, Grow More, Use More Magazine, Vol. II. Issue II. July-September 2021.pdf'
)
run_qa_session(pdf_path)


Please ask your question (or type 'quit' to exit): what is the important part of a sandalwook tree?
Answer: seed oil

Please ask your question (or type 'quit' to exit): in which soil is it optimal to grow?
Answer: salt

Please ask your question (or type 'quit' to exit): quit
Thank you for using the QA system. Goodbye!
