<a href="https://colab.research.google.com/github/xxOmen/scrape_mcqs/blob/main/update%201.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install requests beautifulsoup4 reportlab

import requests
from bs4 import BeautifulSoup
import time
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas

# Base URL with placeholder for page number
base_url = "https://pakmcqs.com/category/mathematics-mcqs/page/{}"

# Define the range of pages you want to scrape (both ends are inclusive)
start_page = 2
end_page = 3  # Adjust as needed

# Seconds to wait for the server before giving up on a request; without a
# timeout, requests.get() can block indefinitely on a stalled connection.
request_timeout = 15

# Seconds to pause between page requests to avoid overwhelming the server
page_delay = 1  # Adjust the delay as needed

# List to hold MCQs for adding to PDF later; each entry is a dict with the
# keys "question", "description" and "answer".
mcqs = []


def _parse_mcq(post):
    """Extract one MCQ from an ``article.grid-card-post`` element.

    Returns a dict with the keys "question", "description" and "answer",
    or None when the post has no title link or no excerpt block.
    """
    title_link = post.select_one("h2.post-title a")
    excerpt = post.select_one("div.excerpt")
    if title_link is None or excerpt is None:
        return None

    # Everything from "Submitted by" onwards is dropped from the description
    description = excerpt.text.strip().split("Submitted by")[0].strip()

    # The correct answer is assumed to be the bolded text inside the excerpt
    bold = post.select_one("div.excerpt strong")
    answer = bold.text.strip() if bold else ""
    if not answer:
        # An empty string would later match every choice as a substring
        answer = "Not specified"

    return {
        "question": title_link.text.strip(),
        "description": description,
        "answer": answer,
    }


# Loop through the range of pages
for page in range(start_page, end_page + 1):
    # Pause before every request except the first, so that failed pages are
    # rate-limited too and no time is wasted sleeping after the last page.
    if page != start_page:
        time.sleep(page_delay)

    # Format the URL with the current page number
    url = base_url.format(page)

    # Send a GET request to the current page; a network failure skips this
    # page instead of aborting the run and losing the MCQs collected so far.
    try:
        response = requests.get(url, timeout=request_timeout)
    except requests.RequestException as e:
        print(f"Failed to retrieve page {page}: {e}")
        continue
    if response.status_code != 200:
        print(f"Failed to retrieve page {page}")
        continue

    # Parse the HTML content
    soup = BeautifulSoup(response.text, "html.parser")

    # Extract every MCQ post on the page
    for post in soup.select("article.grid-card-post"):
        mcq = _parse_mcq(post)
        if mcq is None:
            print(f"Error parsing an MCQ on page {page}: missing title or excerpt")
            continue
        mcqs.append(mcq)

    print(f"Completed scraping page {page}")

# Function to wrap text within a given width (for questions only)
def wrap_text(text, width, canvas_obj, font, font_size):
    """Break text into lines that fit within the given width.

    Args:
        text: String to wrap. It is split on whitespace, so runs of spaces
            and embedded newlines collapse into single word breaks.
        width: Maximum rendered line width, in the units returned by
            ``canvas_obj.stringWidth``.
        canvas_obj: Object providing ``setFont(font, size)`` and
            ``stringWidth(text, font, size)``.
        font: Font name used for measuring.
        font_size: Font size used for measuring.

    Returns:
        The wrapped lines, in order. A single word wider than ``width`` is
        kept whole on its own line (words are never split). Text without
        any words yields ``[""]``.

    Side effect: sets the current font on ``canvas_obj``.
    """
    canvas_obj.setFont(font, font_size)
    lines = []
    line = ""
    for word in text.split():
        test_line = f"{line} {word}".strip()
        if canvas_obj.stringWidth(test_line, font, font_size) <= width:
            line = test_line
            continue
        # Flush the current line only when it holds something: if the very
        # first word is already too wide there is nothing to flush, and
        # appending here would emit a spurious blank line.
        if line:
            lines.append(line)
        line = word
    # Append the last line; keep the historical [""] result for empty text
    if line or not lines:
        lines.append(line)
    return lines

# Labels that introduce an answer choice inside a scraped description
_CHOICE_LABELS = ("A.", "B.", "C.", "D.")


def _flag_correct_choices(choices, answer):
    """Return one boolean per choice telling whether it should be bold.

    An exact match (with or without the leading "A."-style label) wins, so
    that an answer like "1" does not also highlight "10". Only when no
    choice matches exactly does it fall back to substring matching.
    """
    def without_label(choice):
        if choice.startswith(_CHOICE_LABELS):
            return choice[2:].strip()
        return choice

    if not answer:
        # "" is a substring of every string and would bold all choices
        return [False] * len(choices)
    exact = [answer in (choice, without_label(choice)) for choice in choices]
    if any(exact):
        return exact
    return [answer in choice for choice in choices]


# Function to generate PDF from the collected MCQs
def generate_pdf(mcqs, filename="mcqs_output.pdf"):
    """Write the collected MCQs to an A4 PDF file.

    Each question is drawn in bold with word wrapping, followed by its
    answer choices on separate, wrapped lines with the correct one in bold.

    Args:
        mcqs: Iterable of dicts with the keys "question", "description"
            and "answer".
        filename: Path of the PDF file to create.
    """
    c = canvas.Canvas(filename, pagesize=A4)
    width, height = A4
    margin = 40  # Set a margin for text
    bottom_margin = 60  # Space to leave at the bottom of the page
    choice_indent = 10  # Extra left indent for the answer choices
    top = height - margin
    y_position = top  # Starting y position on the page

    def draw_line(x, text, font, font_size, leading):
        """Draw one line at the cursor, starting a new page first if needed."""
        nonlocal y_position
        if y_position < bottom_margin:
            c.showPage()
            y_position = top
        # showPage() resets the canvas graphics state, including the font,
        # so the font is applied per line rather than once per section.
        c.setFont(font, font_size)
        c.drawString(x, y_position, text)
        y_position -= leading

    for i, mcq in enumerate(mcqs, start=1):
        # Write Question with wrapping
        question_lines = wrap_text(
            f"Q{i}: {mcq['question']}", width - 2 * margin, c, "Helvetica-Bold", 12
        )
        for line in question_lines:
            draw_line(margin, line, "Helvetica-Bold", 12, 15)

        y_position -= 5  # Slight gap before answer choices

        # Assuming answer choices are introduced by "A.", "B.", etc.
        description = mcq['description']
        for label in _CHOICE_LABELS:
            description = description.replace(label, "\n" + label)
        # Drop empty fragments (e.g. the one before "A.") instead of
        # drawing them as blank lines.
        choices = [part.strip() for part in description.split("\n")]
        choices = [part for part in choices if part]

        bold_flags = _flag_correct_choices(choices, mcq["answer"])
        for choice, is_correct in zip(choices, bold_flags):
            font = "Helvetica-Bold" if is_correct else "Helvetica"
            # Wrap long choices so they stay inside the right margin
            wrapped = wrap_text(choice, width - 2 * margin - choice_indent, c, font, 10)
            for piece in wrapped:
                if piece:
                    draw_line(margin + choice_indent, piece, font, 10, 12)

        y_position -= 10  # Extra space after each question

    c.save()
    print(f"PDF generated: {filename}")

# Generate PDF from the collected MCQs. Skip it when scraping produced
# nothing, otherwise an empty document would be reported as a success.
if mcqs:
    generate_pdf(mcqs, filename="mcqs_output_maths.pdf")
else:
    print("No MCQs were collected; PDF not generated.")


Collecting reportlab
  Downloading reportlab-4.2.5-py3-none-any.whl.metadata (1.5 kB)
Downloading reportlab-4.2.5-py3-none-any.whl (1.9 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.9/1.9 MB 15.4 MB/s eta 0:00:00
Installing collected packages: reportlab
Successfully installed reportlab-4.2.5
Completed scraping page 2
Completed scraping page 3
PDF generated: mcqs_output_maths.pdf
