COGS 160 Auto-Grader Notebook for Architect Assignments

 Imports

In [1]:
import os
import re
import json
import fitz  # PyMuPDF for PDF parsing
from PIL import Image
from io import BytesIO
from urllib.parse import urlparse
import spacy
nlp = spacy.load("en_core_web_sm")

Rubric Scoring Criteria

In [2]:
rubric = {
    "architect_chosen": 5,
    "bio_750_words": 10,
    "bio_structure": 10,
    "bio_references": 10,
    "10_buildings_with_images": 15,
    "image_quality": 10,
    "image_citations": 10,
    "personal_bio_photo": 5,
    "doc_and_slides": 5,
    "image_relevance": 10,
    "presentation_polish": 20,
}

Extract Text from PDF

In [3]:
def extract_text_from_pdf(pdf_path):
    text = ""
    doc = fitz.open(pdf_path)
    for page in doc:
        text += page.get_text()
    return text

In [4]:
pdf_path = "/Users/tanishqsingh/Downloads/cogs160submisson1.pdf"

Extract Images from PDF

In [5]:
def extract_images_from_pdf(pdf_path, min_width=1200):
    doc = fitz.open(pdf_path)
    image_data = []
    for page_index in range(len(doc)):
        images = doc.get_page_images(page_index)
        for img_index, img in enumerate(images):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            img_pil = Image.open(BytesIO(image_bytes))
            width, height = img_pil.size
            image_data.append({
                "page": page_index + 1,
                "width": width,
                "height": height,
                "is_high_res": width >= min_width
            })
    return image_data

Evaluate Image Quality

In [6]:
def evaluate_image_quality(image_data):
    total = len(image_data)
    high_res_count = sum(1 for img in image_data if img["is_high_res"])
    score = int((high_res_count / max(1, total)) * rubric["image_quality"])
    return {
        "total_images": total,
        "high_res_count": high_res_count,
        "score": score
    }

 Word Count & Structure Evaluation

In [7]:
def evaluate_biography(text):
    result = {}
    doc = nlp(text)
    result["word_count"] = len([token.text for token in doc if token.is_alpha])

    required_sections = ["who they are", "studied", "first building", "significance", "influence"]
    section_hits = sum([1 for section in required_sections if section.lower() in text.lower()])
    result["structure_score"] = int((section_hits / len(required_sections)) * rubric["bio_structure"])

    result["score"] = rubric["bio_750_words"] if result["word_count"] >= 700 else int((result["word_count"] / 750) * rubric["bio_750_words"])
    return result

Reference & Citation Validator (Inline Extraction)

In [8]:
def extract_references_from_text(text):
    # Grab all lines that look like APA references (Year + optional URL)
    lines = text.split("\n")
    references = []
    for line in lines:
        if re.search(r"\(\d{4}\)", line):  # contains (Year)
            if any(x in line.lower() for x in ["doi", "archdaily", "e-architect", "https://"]):
                references.append(line.strip())
    return references

def evaluate_references(ref_list):
    valid = [ref for ref in ref_list if "doi" in ref.lower() or "archdaily" in ref.lower()]
    return {
        "valid_references": len(valid),
        "score": min(len(valid), rubric["bio_references"])
    }

Generate Final Score

In [9]:
def generate_scorecard(scores):
    total = sum([v["score"] for v in scores.values()])
    return {
        "scorecard": {k: v["score"] for k, v in scores.items()},
        "final_score": total,
        "grade": "A" if total >= 90 else "B" if total >= 80 else "C" if total >= 70 else "D"
    }

Run All-in-One Evaluation (PDF Only)

In [10]:
def run_autograder(pdf_path):
    doc_text = extract_text_from_pdf(pdf_path)
    image_data = extract_images_from_pdf(pdf_path)
    ref_list = extract_references_from_text(doc_text)

    results = {}
    results["bio"] = evaluate_biography(doc_text)
    results["references"] = evaluate_references(ref_list)
    results["images"] = evaluate_image_quality(image_data)
    results["citations"] = {"score": min(len(ref_list), 10)}  # Basic citation count

    return generate_scorecard(results)

In [11]:
result = run_autograder(pdf_path)
print(json.dumps(result, indent=2))


{
  "scorecard": {
    "bio": 10,
    "references": 0,
    "images": 6,
    "citations": 0
  },
  "final_score": 16,
  "grade": "D"
}
