<a href="https://colab.research.google.com/github/xxOmen/scrape_mcqs/blob/main/scrapper_2%2Bmcqsgen.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import PyPDF2
import random
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# Function to extract MCQs from a given PDF file
def extract_mcqs_from_pdf(pdf_path):
    """
    Extract question lines from a text-based PDF.

    Each page is read with PyPDF2 and split into lines. A line is kept as an
    MCQ when, after stripping surrounding whitespace, it starts with "Q"
    (adjust the marker if the PDF labels questions differently).

    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        list: The stripped question lines as strings, in page order.
    """
    mcqs = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # A page without a text layer may yield nothing; treat it as empty
            # instead of failing on the split below.
            text = page.extract_text() or ""
            # Strip BEFORE testing the prefix so indented question lines are
            # not silently skipped.
            stripped_lines = (line.strip() for line in text.split("\n"))
            mcqs.extend(line for line in stripped_lines if line.startswith("Q"))
    return mcqs

# Function to generate a random test and answer key
def generate_random_test(mcqs, num_questions, test_filename="test.pdf", answer_key_filename="answer_key.pdf"):
    """
    Generates a random MCQ test and an answer key.

    The test PDF lists a random sample of the MCQs, numbered from 1; questions
    wider than the printable area are word-wrapped onto following lines. The
    answer key PDF contains one "Placeholder Answer" entry per question.

    Args:
        mcqs (list): List of MCQs (strings) extracted from the PDF.
        num_questions (int): Number of questions to include in the test.
            Values above len(mcqs) are reduced to len(mcqs).
        test_filename (str): Output PDF file for the test.
        answer_key_filename (str): Output PDF file for the answer key.
    Raises:
        ValueError: If num_questions is negative.
    """
    if num_questions < 0:
        # random.sample would raise ValueError anyway; fail early with a
        # message that names the offending argument.
        raise ValueError(f"num_questions must be non-negative, got {num_questions}")

    # Ensure requested number of questions doesn't exceed available MCQs
    if num_questions > len(mcqs):
        print(f"Requested {num_questions} exceeds available MCQs ({len(mcqs)}). Using all available MCQs.")
        num_questions = len(mcqs)

    # Randomly select MCQs
    selected_mcqs = random.sample(mcqs, num_questions)

    width, height = A4
    margin = 40
    max_text_width = width - 2 * margin

    def wrap(pdf, text, font, size):
        """Greedily split text into lines no wider than the printable area."""
        lines = []
        current = ""
        for word in text.split():
            candidate = f"{current} {word}".strip()
            # A single over-long word is kept on its own line rather than
            # producing an empty line in front of it.
            if current and pdf.stringWidth(candidate, font, size) > max_text_width:
                lines.append(current)
                current = word
            else:
                current = candidate
        lines.append(current)
        return lines

    def write_pdf(filename, title, entries):
        """Write a titled PDF with one (possibly wrapped) entry after another."""
        pdf = canvas.Canvas(filename, pagesize=A4)
        y_position = height - margin

        pdf.setFont("Helvetica-Bold", 16)
        pdf.drawString(margin, y_position, title)
        y_position -= 30

        for entry in entries:
            for line in wrap(pdf, entry, "Helvetica", 12):
                if y_position < 60:
                    pdf.showPage()
                    y_position = height - margin
                # Set the font for every line: a page break resets it.
                pdf.setFont("Helvetica", 12)
                pdf.drawString(margin, y_position, line)
                y_position -= 20

        pdf.save()

    # Generate the test PDF
    write_pdf(
        test_filename,
        "MCQs Test",
        [f"{i}. {mcq}" for i, mcq in enumerate(selected_mcqs, start=1)],
    )
    print(f"Test saved as {test_filename}")

    # Generate the answer key PDF (answers are not extracted yet, so every
    # entry is a placeholder)
    write_pdf(
        answer_key_filename,
        "Answer Key",
        [f"{i}. Placeholder Answer" for i in range(1, num_questions + 1)],
    )
    print(f"Answer key saved as {answer_key_filename}")

# Main function to process the uploaded PDF and generate MCQs
def main():
    """
    Prompt for a question count, extract MCQs from the uploaded PDF, build the
    test and answer-key PDFs, and trigger their download in Colab.

    Invalid input (non-numeric, or less than 1) is reported and the function
    returns without generating anything.
    """
    # Step 1: Specify the uploaded file path and read the requested size
    pdf_path = "/content/ilovepdf_merged.pdf"  # Replace with your uploaded file's name if different
    raw_count = input("Enter the number of MCQs for the test: ")
    try:
        num_questions = int(raw_count)
    except ValueError:
        print(f"'{raw_count}' is not a whole number. Please enter a positive integer.")
        return
    if num_questions < 1:
        print("The number of MCQs must be at least 1.")
        return

    # Step 2: Extract MCQs from the PDF
    mcqs = extract_mcqs_from_pdf(pdf_path)
    if not mcqs:
        print("No MCQs found in the file. Please check the PDF format.")
        return

    # Step 3: Generate the test and answer key
    test_filename = "generated_test.pdf"
    answer_key_filename = "generated_answer_key.pdf"
    generate_random_test(mcqs, num_questions, test_filename, answer_key_filename)

    # Step 4: Provide download links for the generated files
    # (imported here because google.colab only exists inside Colab)
    from google.colab import files
    files.download(test_filename)
    files.download(answer_key_filename)

# Run the main function
if __name__ == "__main__":
    main()


In [None]:
!pip install requests beautifulsoup4 reportlab

import requests
from bs4 import BeautifulSoup
import time
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# Base URL with placeholder for page number
base_url = "https://pakmcqs.com/category/mathematics-mcqs/page/{}"

# Define the range of pages you want to scrape (both ends inclusive)
start_page = 2
end_page = 3  # Adjust as needed

# List to hold MCQs for adding to PDF later; each entry is a dict with
# "question", "description" and "answer" keys
mcqs = []

# Loop through the range of pages
for page in range(start_page, end_page + 1):
    # Delay between consecutive requests to avoid overwhelming the server.
    # Done up front so the pause also happens after a failed page.
    if page != start_page:
        time.sleep(1)  # Adjust the delay as needed

    # Format the URL with the current page number
    url = base_url.format(page)

    # Send a GET request to the current page. The timeout keeps a stalled
    # connection from hanging the cell; network errors skip the page instead
    # of aborting the whole run.
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as e:
        print(f"Failed to retrieve page {page}: {e}")
        continue
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Select all MCQ posts on the page
    mcq_posts = soup.select("article.grid-card-post")

    # Loop through each MCQ post and extract relevant information
    for post in mcq_posts:
        try:
            # Extract the question title
            title = post.select_one("h2.post-title a").text.strip()

            # Extract the excerpt text, dropping the trailing "Submitted by ..." part
            description = post.select_one("div.excerpt").text.strip()
            description = description.split("Submitted by")[0].strip()

            # The correct answer is assumed to be the bolded text in the excerpt
            correct_answer = post.select_one("div.excerpt strong")
            correct_answer = correct_answer.text.strip() if correct_answer else "Not specified"

            # Append data to mcqs list
            mcqs.append({
                "question": title,
                "description": description,
                "answer": correct_answer
            })
        except AttributeError as e:
            # select_one() returned None: the post lacks a title or excerpt.
            print(f"Error parsing an MCQ on page {page}: {e}")
            continue

    print(f"Completed scraping page {page}")

# Function to wrap text within a given width (for questions only)
def wrap_text(text, width, canvas_obj, font, font_size):
    """Break text into lines that fit within the given width.

    Words are packed greedily: a word is added to the current line while the
    line's rendered width (as measured by ``canvas_obj.stringWidth``) stays
    within ``width``. A single word wider than ``width`` is placed on a line
    of its own. Runs of whitespace in ``text`` are collapsed.

    Args:
        text (str): Text to wrap.
        width (float): Maximum line width, in the canvas's units.
        canvas_obj: Object providing ``setFont`` and ``stringWidth``.
        font (str): Font name used for measuring.
        font_size (int): Font size used for measuring.
    Returns:
        list: The wrapped lines. Empty text yields ``[""]``.
    """
    canvas_obj.setFont(font, font_size)
    lines = []
    line = ""
    for word in text.split():
        test_line = f"{line} {word}".strip()
        if canvas_obj.stringWidth(test_line, font, font_size) <= width:
            line = test_line
            continue
        # Flush only a non-empty line; otherwise an over-wide first word
        # would put a spurious blank line at the top of the result.
        if line:
            lines.append(line)
        line = word
    lines.append(line)  # Append the last line
    return lines

# Function to generate PDF from the collected MCQs
def generate_pdf(mcqs, filename="mcqs_output.pdf"):
    """
    Render the collected MCQs into an A4 PDF.

    Each entry is drawn as a bold, word-wrapped question ("Q<n>: ...")
    followed by its description, split so that every "A."/"B."/"C."/"D."
    marker starts a new line. A choice line containing the entry's answer
    text is drawn in bold.

    Args:
        mcqs (list): Dicts with "question", "description" and "answer" keys.
        filename (str): Path of the PDF file to write.
    """
    c = canvas.Canvas(filename, pagesize=A4)
    width, height = A4
    margin = 40  # Set a margin for text
    bottom_margin = 60  # Space to leave at the bottom of the page
    y_position = height - margin  # Starting y position on the page

    for i, mcq in enumerate(mcqs, start=1):
        # Write Question with wrapping
        question_lines = wrap_text(f"Q{i}: {mcq['question']}", width - 2 * margin, c, "Helvetica-Bold", 12)
        for line in question_lines:
            if y_position < bottom_margin:  # Check if there's enough space for text
                c.showPage()
                y_position = height - margin  # Reset y_position for the new page
            # Set the font per line: showPage() resets the canvas font, so a
            # question continuing on a new page would otherwise lose its bold.
            c.setFont("Helvetica-Bold", 12)
            c.drawString(margin, y_position, line)
            y_position -= 15

        # Write Description with answer choices on separate lines, bolding the correct answer
        y_position -= 5  # Slight gap before answer choices

        # Assuming answer choices are split by "A.", "B.", etc.
        choices = mcq['description']
        for label in ("A.", "B.", "C.", "D."):
            choices = choices.replace(label, "\n" + label)

        answer = mcq["answer"]
        for choice_line in choices.split("\n"):
            choice_line = choice_line.strip()
            if not choice_line:
                # The split leaves an empty piece when the description starts
                # with a marker; don't spend a row on it.
                continue

            if y_position < bottom_margin:  # Check if there's enough space for text
                c.showPage()
                y_position = height - margin  # Reset y_position for the new page

            # Bold the correct answer choice. An empty answer string is a
            # substring of everything, so it must not count as a match.
            if answer and answer in choice_line:
                c.setFont("Helvetica-Bold", 10)  # Set font to bold for the correct answer
            else:
                c.setFont("Helvetica", 10)  # Set font back to regular for other options
            c.drawString(margin + 10, y_position, choice_line)
            y_position -= 12

        y_position -= 10  # Extra space after each question

        # Create new page if y_position is too low
        if y_position < bottom_margin:
            c.showPage()
            y_position = height - margin  # Reset y_position for new page

    c.save()
    print(f"PDF generated: {filename}")

# Generate PDF from the collected MCQs
generate_pdf(mcqs, filename="mcqs_output_maths.pdf")


In [12]:
# Install required libraries
!pip install PyPDF2 reportlab
!apt-get install poppler-utils tesseract-ocr
!pip install pytesseract pdf2image PyPDF2 reportlab


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  tesseract-ocr-eng tesseract-ocr-osd
The following NEW packages will be installed:
  poppler-utils tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
0 upgraded, 4 newly installed, 0 to remove and 49 not upgraded.
Need to get 5,002 kB of archives.
After this operation, 16.3 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.5 [186 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-eng all 1:4.00~git30-7274cfa-1.1 [1,591 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr-osd all 1:4.00~git30-7274cfa-1.1 [2,990 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy/universe amd64 tesseract-ocr amd64 4.1.1-2.1build1 [236 kB]
Fetched 5,002 kB in 3s (1,823 kB/s)
Selecting previously unselected package popp

In [16]:

import pytesseract
from pdf2image import convert_from_path
import PyPDF2
import random
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import re

# OCR Function for Image-Based PDFs
def ocr_pdf_to_text(pdf_path):
    """
    Run OCR over every page of an image-based PDF.

    The PDF is rasterised with ``convert_from_path`` and each resulting image
    is passed to ``pytesseract.image_to_string``. The per-page results are
    concatenated, each followed by a newline.

    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: Extracted text from the PDF.
    """
    page_images = convert_from_path(pdf_path)
    page_texts = [pytesseract.image_to_string(page_image) for page_image in page_images]
    return "".join(page_text + "\n" for page_text in page_texts)

# Function for Text-Based PDFs
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a text-based PDF file.

    The text of each page is read with PyPDF2 and the pages are joined with a
    newline, so the last line of one page never fuses with the first line of
    the next (which would hide a question that starts a page from line-based
    parsing).

    Args:
        pdf_path (str): Path to the PDF file.
    Returns:
        str: Extracted text from the PDF, pages separated by "\\n".
    """
    page_texts = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            # A page without a text layer may yield nothing; treat it as empty.
            page_texts.append(page.extract_text() or "")
    return "\n".join(page_texts)

# Function to Extract MCQs from Text
def extract_mcqs_from_text(text):
    """
    Group the lines of a text into questions with their options.

    A stripped line starting with "Q" opens a new question; stripped lines
    starting with "A.", "B.", "C." or "D." are attached as options to the
    question currently open. All other lines, and option lines that appear
    before the first question, are ignored.

    Args:
        text (str): Text content from the PDF.
    Returns:
        list: Dicts of the form {"question": str, "options": list of str},
            in the order the questions appear.
    """
    option_pattern = re.compile(r"^[A-D]\.")
    collected = []
    current = None  # the question being filled, or None before the first one

    for raw_line in text.split("\n"):
        entry = raw_line.strip()
        if entry.startswith("Q"):
            # A new question closes the one that was open, if any.
            if current is not None:
                collected.append(current)
            current = {"question": entry, "options": []}
            continue
        if current is not None and option_pattern.match(entry):
            current["options"].append(entry)

    # Close the final question.
    if current is not None:
        collected.append(current)

    return collected

# Function to Wrap Text for PDF
def wrap_text(text, width, canvas_obj, font, font_size):
    """
    Break text into lines that fit within the given width.

    Words are packed greedily: a word is added to the current line while the
    line's rendered width (as measured by ``canvas_obj.stringWidth``) stays
    within ``width``. A single word wider than ``width`` is placed on a line
    of its own. Runs of whitespace in ``text`` are collapsed.

    Args:
        text (str): Text to wrap.
        width (float): Maximum line width, in the canvas's units.
        canvas_obj: Object providing ``setFont`` and ``stringWidth``.
        font (str): Font name used for measuring.
        font_size (int): Font size used for measuring.
    Returns:
        list: The wrapped lines. Empty text yields ``[""]``.
    """
    canvas_obj.setFont(font, font_size)
    lines = []
    line = ""
    for word in text.split():
        test_line = f"{line} {word}".strip()
        if canvas_obj.stringWidth(test_line, font, font_size) <= width:
            line = test_line
            continue
        # Flush only a non-empty line; otherwise an over-wide first word
        # would put a spurious blank line at the top of the result.
        if line:
            lines.append(line)
        line = word
    lines.append(line)
    return lines

# Function to Generate Test PDF
def generate_random_test(mcqs, num_questions, test_filename="test.pdf"):
    """
    Generates a random MCQ test.

    A random sample of the MCQs is written to an A4 PDF: each question is
    drawn bold and word-wrapped as "Q<n>: ...", followed by its options in a
    smaller regular font, indented under the question.

    Args:
        mcqs (list): List of MCQs extracted from the PDF; each is a dict with
            "question" (str) and "options" (list of str).
        num_questions (int): Number of questions to include in the test.
            Values above len(mcqs) are reduced to len(mcqs).
        test_filename (str): Output PDF file for the test.
    Raises:
        ValueError: If num_questions is negative.
    """
    if num_questions < 0:
        # random.sample would raise ValueError anyway; fail early with a
        # message that names the offending argument.
        raise ValueError(f"num_questions must be non-negative, got {num_questions}")

    if num_questions > len(mcqs):
        print(f"Requested {num_questions} exceeds available MCQs ({len(mcqs)}). Using all available MCQs.")
        num_questions = len(mcqs)

    selected_mcqs = random.sample(mcqs, num_questions)

    # Generate the test PDF
    c = canvas.Canvas(test_filename, pagesize=A4)
    width, height = A4
    margin = 40
    y_position = height - margin

    c.setFont("Helvetica-Bold", 16)
    c.drawString(margin, y_position, "MCQs Test")
    y_position -= 30

    for i, mcq in enumerate(selected_mcqs, start=1):
        # Start a question only where there is some room left below it.
        if y_position < 80:
            c.showPage()
            y_position = height - margin

        # Question
        question_lines = wrap_text(f"Q{i}: {mcq['question']}", width - 2 * margin, c, "Helvetica-Bold", 12)
        for line in question_lines:
            # A long wrapped question must not run off the bottom of the page.
            if y_position < 60:
                c.showPage()
                y_position = height - margin
            # Set the font per line: showPage() resets the canvas font.
            c.setFont("Helvetica-Bold", 12)
            c.drawString(margin, y_position, line)
            y_position -= 15

        # Options
        for option in mcq['options']:
            if y_position < 60:
                c.showPage()
                y_position = height - margin
            # Set per line for the same reason: options after a page break
            # would otherwise be drawn in the default font and size.
            c.setFont("Helvetica", 10)
            c.drawString(margin + 20, y_position, option)
            y_position -= 15

        y_position -= 10

    c.save()
    print(f"Test saved as {test_filename}")

# Main Function
def main():
    """
    Build a random MCQ test from the hardcoded PDF and download it in Colab.

    Text-layer extraction is tried first. OCR is used as a fallback both when
    that extraction raises and when it yields no MCQs, since an image-only
    PDF typically produces empty text rather than an error.
    """
    # Step 1: Hardcoded path and number of questions
    pdf_path = "/content/ilovepdf_merged.pdf"  # Replace with your PDF file's path
    num_questions = 100  # Hardcode the number of questions here

    # Step 2: Extract MCQs, preferring the PDF's text layer
    try:
        text = extract_text_from_pdf(pdf_path)
        mcqs = extract_mcqs_from_text(text)
    except Exception as e:
        # Top-level boundary: report the reason instead of discarding it,
        # then fall through to OCR.
        print(f"Text-based extraction failed ({e}), trying OCR...")
        mcqs = []
    else:
        if not mcqs:
            print("No MCQs found in the text layer, trying OCR...")

    if not mcqs:
        text = ocr_pdf_to_text(pdf_path)
        mcqs = extract_mcqs_from_text(text)

    if not mcqs:
        print("No MCQs found in the file. Please check the PDF format.")
        return

    # Step 3: Generate Test PDF
    generate_random_test(mcqs, num_questions, "mcqs_test.pdf")

    # Step 4: Provide Download Link
    # (imported here because google.colab only exists inside Colab)
    from google.colab import files
    files.download("mcqs_test.pdf")

if __name__ == "__main__":
    main()


Test saved as mcqs_test.pdf


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>