In [1]:
import fitz  
import json
import re

def extract_questions_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ""
    for page in doc:
        text += page.get_text()
    return text

def parse_questions(text, mark_label):
    section = re.search(rf"{mark_label}.*?(5 Marks|$)", text, re.DOTALL | re.IGNORECASE)
    if not section:
        return {}

    section_text = section.group(0)
    question_blocks = re.split(r"\n\d+\.\s", section_text)[1:]  
    question_data = {}
    for i, block in enumerate(question_blocks, 1):
        q_number = f"Q{i}"
        question_match = re.match(r"(.*?)(Solution:|STEP 1:)", block, re.DOTALL)
        question = question_match.group(1).strip().replace("\n", " ") if question_match else "Unknown"

        steps = re.findall(r"STEP \d+: (.*?)(?=STEP \d+:|$)", block, re.DOTALL)
        cleaned_steps = [re.sub(r"\s+", " ", step).strip() for step in steps]
        
        question_data[q_number] = {
            "question": question,
            "steps": len(cleaned_steps),
            "answers": cleaned_steps
        }
    return question_data

def convert_to_json(pdf_path):
    full_text = extract_questions_from_pdf(pdf_path)
    two_mark_questions = parse_questions(full_text, "2 Marks")
    five_mark_questions = parse_questions(full_text, "5 Marks")

    final_json = {
        "2_marks": two_mark_questions,
        "5_marks": five_mark_questions
    }
    return final_json

pdf_path = r"D:\BNR\Task2\math_machine_task_2.pdf"  
result = convert_to_json(pdf_path)

with open("answer_key.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

In [53]:
import requests
import os
from pdf2image import convert_from_path
import cv2
import numpy as np
from PIL import Image


API_KEY = 'K88030080888957'
PDF_PATH = r"C:\Users\SABAREESH\Downloads\Text02.pdf"
TEMP_FOLDER = 'temp_images'
DPI = 300
USE_PREPROCESSING = True  

if not os.path.exists(TEMP_FOLDER):
    os.makedirs(TEMP_FOLDER)

def preprocess_image(image_path):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                cv2.THRESH_BINARY, 11, 2)
    cv2.imwrite(image_path, img)

pages = convert_from_path(PDF_PATH, dpi=DPI,poppler_path= r"C:\poppler\Library\bin")
ocr_text = []

for i, page in enumerate(pages):
    image_path = os.path.join(TEMP_FOLDER, f'page_{i+1}.png')
    page.save(image_path, 'PNG')

    if USE_PREPROCESSING:
        preprocess_image(image_path)

    print(f"OCR processing Page {i+1}...")

    with open(image_path, 'rb') as img_file:
        response = requests.post(
            'https://api.ocr.space/parse/image',
            files={'file': img_file},
            data={
                'apikey': API_KEY,
                'language': 'eng',
                'isTable': True,
                'scale': True,
                'OCREngine': 2
            }
        )

    try:
        result = response.json()
        parsed = result['ParsedResults'][0]['ParsedText']
        ocr_text.append(f"{parsed.strip()}")
    except Exception as e:
        ocr_text.append(f"[ERROR] {e}")

final_output = "\n".join(ocr_text)

OCR processing Page 1...
OCR processing Page 2...
OCR processing Page 3...
OCR processing Page 4...
OCR processing Page 5...
OCR processing Page 6...


In [54]:
final_output

"2 Marks\t\r\nQ2. Find the sum of the first odd numbers:.\t\r\nSolution:-\t\r\nSTEP 1: Finst, list the pist 5 odd numbers: 1, 3,5,7,9\t\r\nSTEP 2:- Now, we have to bin the odd numbers\t\r\n1+334\t\r\n4+5 = 9\t\r\n9+7=16\t\r\n16+9=25\t\r\nSTEP 3: The total sein the odd number is 25\t\r\nQ3. What numbers comes neat in the pattern: 1, 2,4,8,16..?\t\r\nSolution! -\t\r\nStep 1:- First, observe the pattern\t\r\nStep 2: 1x2=2\t\r\n2x2 = 4\t\r\n4x2:8\t\r\n8x2 = 16\t\r\nSTEP 3: Nosit, 16 bg 2\t\r\n16x2 = 52\t\r\nSTEP 4: The nes number is 32\nQ5. What shape comes after a hescapon in regular\t\r\npolygen sequence?\t\r\nSolutiOn: -\t\r\nSTEP 1: A hercagon has 6 sides\t\r\nSTEP 2: We are moving prward by 1.\t\r\nSTEP 3: 14 bides is hexagon.\t\r\nQ7. What is the iith cube number?\t\r\nSolution:.\t\r\nStep 1: we have to find the cube of h\t\r\nstep 2: The 1ith cube number is ol\t\r\nWhat is the sun of 12+5+2+1?\t\r\nQ1O\t\r\nSolution:.\t\r\nSTEP 1 : Pattern:\t\r\n1+2=3\t\r\n3+3 = 0\t\r\n6+2=8\t\r\n8t

In [67]:
import json
import re

def parse_questions(text):
    questions = {}
    blocks = re.split(r"\nQ\d+[^\n]*", text)
    q_titles = re.findall(r"\n(Q\d+[^\n]*)", text)

    for i, block in enumerate(blocks[1:]): 
        q_title = q_titles[i].strip()
        q_id = re.search(r"Q\d+", q_title).group()

        question_text = q_title.split(".", 1)[-1].strip()
        steps = re.findall(r"(?:STEP|Step|steph)[^\d]*\d*[:\.\-]?\s*(.*?)(?=(?:STEP|Step|steph)[^\d]*\d*[:\.\-]?|Q\d+|$)", block, re.DOTALL)
        steps = [re.sub(r"\s+", " ", step).strip() for step in steps if step.strip()]

        questions[q_id] = {
            "question": question_text,
            "steps": len(steps),
            "answers": steps
        }

    return questions

def convert_to_json(text):
    data = {
        "2_marks": {},
        "5_marks": {}
    }

    match = re.search(r"2 Marks(.*?)(5 Marks|$)", text, re.DOTALL | re.IGNORECASE)
    if match:
        data["2_marks"] = parse_questions(match.group(1))
    match = re.search(r"5 Marks(.*)", text, re.DOTALL | re.IGNORECASE)
    if match:
        data["5_marks"] = parse_questions(match.group(1))

    return data

result = convert_to_json(final_output)

with open("ocr_answer_key.json", "w", encoding="utf-8") as f:
    json.dump(result, f, indent=2, ensure_ascii=False)

In [73]:
import json
from difflib import SequenceMatcher

with open('answer_key.json', 'r', encoding='utf-8') as f:
    answer_key = json.load(f)

with open('ocr_answer_key.json', 'r', encoding='utf-8') as f:
    ocr_answer_key = json.load(f)

def similarity(a, b):
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()

def score_question(correct, ocr, step_mark, answer_mark):
    if not ocr['answers']:
        return 0.0

    correct_steps = correct['steps']
    ocr_steps = len(ocr['answers'])
    matched_steps = 0
    min_steps = min(correct_steps, ocr_steps)

    for i in range(min_steps):
        if similarity(correct['answers'][i], ocr['answers'][i]) > 0.5:
            matched_steps += 1

    step_score = (ocr_steps / correct_steps) * (step_mark / 2)
    match_score = (matched_steps / correct_steps) * (step_mark / 2)

    final_correct = correct['answers'][-1]
    final_ocr = ocr['answers'][-1]
    final_score = answer_mark if similarity(final_correct, final_ocr) > 0.5 else 0

    return round(step_score + match_score + final_score, 2)

def evaluate_section(section_name, step_mark, answer_mark, max_per_qn):
    score = {}
    total = 0
    max_total = 0
    for qid, ocr_q in ocr_answer_key[section_name].items():
        if qid in answer_key[section_name]:
            marks = score_question(answer_key[section_name][qid], ocr_q, step_mark, answer_mark)
            score[qid] = marks
        else:
            score[qid] = 0
        total += score[qid]
        max_total += max_per_qn
    return score, total, max_total

marks_2, total_2, max_2 = evaluate_section("2_marks", 1, 1, 2)
marks_5, total_5, max_5 = evaluate_section("5_marks", 2, 3, 5)

print("2-mark Questions:")
for q, m in marks_2.items():
    print(f"{q}: {m}/2")

print("\n5-mark Questions:")
for q, m in marks_5.items():
    print(f"{q}: {m}/5")

print(f"\nTotal Score: {total_2 + total_5}/{max_2 + max_5}")


2-mark Questions:
Q2: 2.0/2
Q3: 0.62/2
Q5: 1.62/2
Q7: 1.83/2

5-mark Questions:
Q1: 2.5/5
Q6: 1.12/5

Total Score: 9.690000000000001/18
