# Step 6.1: Basic Reporting & Archiving
Generates score reports, scored PDFs, samples, and performs backup.

**Features:**
- ‚úÖ Score calculation and Excel reporting
- ‚úÖ Scored script image generation
- ‚úÖ Individual PDF generation
- ‚úÖ Sample collection generation
- ‚úÖ Project backup

In [1]:
from grading_utils import setup_paths, create_directories, build_student_id_mapping
import os
import json
import pandas as pd
import shutil
import time
from datetime import datetime
from pathlib import Path
from PIL import Image
import cv2
from IPython.display import display, clear_output
from ipywidgets import IntProgress, HTML
import logging
from pypdf import PdfReader, PdfWriter
import re
import matplotlib.pyplot as plt
import seaborn as sns
import hashlib
import math
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.shared import Pt, Inches


# Robust logging setup
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

print("‚úÖ Robust Step 6: Post-Scoring Packaging initialized")
print(f"‚úì Session started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Configuration
passingMark = 15  # Adjust as needed
prefix = "VTC Test"
paths = setup_paths(prefix, "sample")

# Extract commonly used paths
pdf_file = paths["pdf_file"]
name_list_file = paths["name_list_file"]
base_path = paths["base_path"]
base_path_images = paths["base_path_images"]
base_path_annotations = paths["base_path_annotations"]
base_path_questions = paths["base_path_questions"]
base_path_marked_images = paths["base_path_marked_images"]
base_path_marked_pdfs = paths["base_path_marked_pdfs"]
base_path_marked_scripts = paths["base_path_marked_scripts"]
CACHE_DIR = paths.get("cache_dir", "../cache")

# Create all necessary directories
create_directories(paths)

print("‚úì Paths configured and directories created")

# Metadata questions that should be excluded from answer analysis
METADATA_QUESTIONS = ["NAME", "ID", "CLASS"]

print("üí° Metadata questions (NAME, ID, CLASS) will be excluded from answer analysis")

‚úÖ Robust Step 6: Post-Scoring Packaging initialized
‚úì Session started at: 2026-01-08 07:38:28
‚úì Paths configured and directories created
üí° Metadata questions (NAME, ID, CLASS) will be excluded from answer analysis


In [2]:
# Robust score report generation with validation and analytics

def generate_score_report():
    """Generate comprehensive score report with validation and analytics."""
    logger.info("üìä Generating score report...")
    try:
        name_list_df = pd.read_excel(name_list_file, sheet_name="Name List")
        id_col = next((col for col in name_list_df.columns if col.lower() == "id"), None)
        name_col = next(
            (col for col in name_list_df.columns if col.lower() in {"name", "student name", "student_name"}),
            None,
        )
        if id_col is None or name_col is None:
            raise ValueError("Name list must contain ID and NAME columns.")

        name_map = (
            name_list_df.assign(**{id_col: name_list_df[id_col].astype(str)})
            .set_index(id_col)[name_col]
            .astype(str)
            .to_dict()
        )
        logger.info(f"‚úì Loaded {len(name_map)} student names from name list")

        pageToStudentId, numberOfPage, getStudentId = build_student_id_mapping(
            base_path_questions, base_path_annotations
        )
        logger.info(f"‚úì Built student ID mapping for {numberOfPage} pages")

        questionAndMarks = {}
        questions_processed = 0
        for path, _, files in os.walk(base_path_questions):
            for file in files:
                if file == "mark.json":
                    question = path[len(base_path_questions) + 1 :]
                    try:
                        with open(os.path.join(path, file), "r", encoding="utf-8") as f:
                            data = json.load(f)
                        marks = {}
                        for item in data:
                            studentId = getStudentId(int(item["id"]))
                            marks[studentId] = (
                                item["overridedMark"] if item["overridedMark"] != "" else item["mark"]
                            )
                        questionAndMarks[question] = marks
                        questions_processed += 1
                        logger.info(f"‚úì Processed marks for {question}: {len(marks)} students")
                    except Exception as e:
                        logger.error(f"‚ùå Failed to process marks for {question}: {e}")
                        continue

        logger.info(f"‚úì Processed marks from {questions_processed} questions")
        if not questionAndMarks:
            raise ValueError("No question marks were processed.")
       
        marksDf = pd.DataFrame(questionAndMarks)

        # Reorder columns: ID, NAME, CLASS first, then questions sorted
        base_cols = ["ID", "NAME", "CLASS"]
        question_cols = [
            col
            for col in sorted(marksDf.columns)
            if col not in base_cols
        ]
        marksDf = marksDf[base_cols + question_cols]

        marksDf["NAME"] = marksDf["ID"].map(name_map).fillna(marksDf["NAME"])

        

        # Calculate total marks from question columns only
        marksDf["Marks"] = (
            marksDf.loc[:, ~marksDf.columns.isin(["ID", "NAME", "CLASS"])]
            .apply(pd.to_numeric, errors="coerce")
            .sum(axis=1)
        )

        invalid_marks = marksDf[marksDf["Marks"].isna()]
        if not invalid_marks.empty:
            logger.warning(f"Found {len(invalid_marks)} students with invalid marks")

        logger.info(f"‚úì Generated marks report for {len(marksDf)} students")
        logger.info(f"  Average score: {marksDf['Marks'].mean():.2f}")
        logger.info(f"  Score range: {marksDf['Marks'].min():.1f} - {marksDf['Marks'].max():.1f}")

        return marksDf
    except Exception as e:
        logger.error(f"‚ùå Score report generation failed: {e}")
        raise


marksDf = generate_score_report()
display(marksDf)


2026-01-08 07:38:28,206 - INFO - üìä Generating score report...
2026-01-08 07:38:28,392 - INFO - ‚úì Loaded 4 student names from name list
2026-01-08 07:38:28,395 - INFO - ‚úì Built student ID mapping for 2 pages
2026-01-08 07:38:28,396 - INFO - ‚úì Processed marks for Q5: 4 students
2026-01-08 07:38:28,399 - INFO - ‚úì Processed marks for Q4: 4 students
2026-01-08 07:38:28,401 - INFO - ‚úì Processed marks for NAME: 4 students
2026-01-08 07:38:28,403 - INFO - ‚úì Processed marks for CLASS: 4 students
2026-01-08 07:38:28,406 - INFO - ‚úì Processed marks for Q3: 4 students
2026-01-08 07:38:28,409 - INFO - ‚úì Processed marks for Q2: 4 students
2026-01-08 07:38:28,412 - INFO - ‚úì Processed marks for Q1: 4 students
2026-01-08 07:38:28,421 - INFO - ‚úì Processed marks for ID: 4 students
2026-01-08 07:38:28,424 - INFO - ‚úì Processed marks from 8 questions
2026-01-08 07:38:28,446 - INFO - ‚úì Generated marks report for 4 students
2026-01-08 07:38:28,450 - INFO -   Average score: 11.00
2026

Unnamed: 0,ID,NAME,CLASS,Q1,Q2,Q3,Q4,Q5,Marks
234567890,234567890,John,C,0.0,1.0,3.0,0.0,5.0,9.0
123456789,123456789,Peter,A,2.0,6.0,1.0,8.0,3.0,20.0
987654321,987654321,Mary,B,1.0,6.0,0.0,0.0,0.0,7.0
345678912,345678912,Susan,D,2.0,6.0,0.0,0.0,0.0,8.0


In [3]:
# Robust scored scripts creation with comprehensive validationdef create_scored_scripts():    """Create scored scripts with validation and error handling"""    print("üìÑ Creating scored scripts...")        try:        # Copy raw images to marked folder with validation        if os.path.exists(base_path_marked_images):            shutil.rmtree(base_path_marked_images)                copied_path = shutil.copytree(base_path_images, base_path_marked_images)                # Validate copy operation        original_files = len([f for f in os.listdir(base_path_images) if f.endswith('.jpg')])        copied_files = len([f for f in os.listdir(base_path_marked_images) if f.endswith('.jpg')])                if original_files != copied_files:            raise Exception(f"Image copy validation failed: {original_files} original vs {copied_files} copied")                logger.info(f"‚úì Copied {copied_files} images to marked folder")                # Load and validate annotations        annotations_path = base_path_annotations + "annotations.json"        with open(annotations_path, "r") as f:             annotations = json.load(f)                # Flatten annotations to list with validation        annotations_list = []        for page in annotations:            for annotation in annotations[page]:                annotation["page"] = int(page)                # x to left, y to top                annotation["left"] = annotation["x"]                annotation["top"] = annotation["y"]                annotation.pop("x")                annotation.pop("y")                annotations_list.append(annotation)                # Convert annotations_list to dict with key with label        annotations_dict = {}        for annotation in annotations_list:            annotations_dict[annotation["label"]] = annotation                logger.info(f"‚úì Processed {len(annotations_dict)} annotations")                # Build student ID to page mapping        studentIdToPage = {}        with open(os.path.join(base_path_questions, "ID", "mark.json")) as f:            data = json.load(f)            for i in data:                studentId = i["overridedMark"] if i["overridedMark"] != "" else i["mark"]                studentIdToPage[studentId] = int(i["id"])                logger.info(f"‚úì Built student-to-page mapping for {len(studentIdToPage)} students")                # Add marks to images with progress tracking        marksDf_list = marksDf.to_dict(orient="records")                progress = IntProgress(min=0, max=len(marksDf_list), description='Adding marks')        display(progress)                processed_students = 0        failed_students = []                for student in marksDf_list:            try:                first_page = studentIdToPage[student["ID"]]                                for annotation in annotations_dict:                    value = student[annotation]                    if annotation == "ID":                        value = value + " Marks: " + str(student["Marks"])                                        x = annotations_dict[annotation]["left"]                    y = annotations_dict[annotation]["top"]                    page = first_page + annotations_dict[annotation]["page"]                                      image_path = base_path_marked_images + str(page) + ".jpg"                                        if not os.path.exists(image_path):                        logger.warning(f"Image not found: {image_path}")                        continue                                        # Add text to image with error handling                    try:                        img = cv2.imread(image_path)                        if img is None:                            logger.warning(f"Failed to load image: {image_path}")                            continue                                                textSize = cv2.getTextSize(text=str(value), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2)                        height = textSize[0][1]                        cv2.putText(img, str(value), (x, y + height), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)                        cv2.imwrite(image_path, img)                                            except Exception as e:                        logger.warning(f"Failed to add text to {image_path}: {e}")                        continue                                processed_students += 1                            except Exception as e:                logger.error(f"Failed to process student {student['ID']}: {e}")                failed_students.append(student['ID'])                        progress.value += 1                logger.info(f"‚úì Added marks to images for {processed_students} students")        if failed_students:            logger.warning(f"Failed to process {len(failed_students)} students: {failed_students}")                return studentIdToPage, processed_students, failed_students            except Exception as e:        logger.error(f"‚ùå Scored scripts creation failed: {e}")        raise# Create scored scriptsstudentIdToPage, processed_students, failed_students = create_scored_scripts()# Create scored scriptsstudentIdToPage, processed_students, failed_students = create_scored_scripts()
def create_scored_scripts():
    """Create scored scripts with validation and error handling."""
    print("üìÑ Creating scored scripts...")

    try:
        if os.path.exists(base_path_marked_images):
            shutil.rmtree(base_path_marked_images)

        shutil.copytree(base_path_images, base_path_marked_images)

        original_files = len([f for f in os.listdir(base_path_images) if f.endswith(".jpg")])
        copied_files = len([f for f in os.listdir(base_path_marked_images) if f.endswith(".jpg")])
        if original_files != copied_files:
            raise Exception(f"Image copy validation failed: {original_files} original vs {copied_files} copied")

        logger.info(f"‚úì Copied {copied_files} images to marked folder")

        annotations_path = os.path.join(base_path_annotations, "annotations.json")
        with open(annotations_path, "r") as f:
            annotations = json.load(f)

        annotations_dict = {}
        for page, page_ann in annotations.items():
            for annotation in page_ann:
                annotation = annotation.copy()
                annotation["page"] = int(page)
                annotation["left"] = annotation.pop("x")
                annotation["top"] = annotation.pop("y")
                annotations_dict[annotation["label"]] = annotation

        logger.info(f"‚úì Processed {len(annotations_dict)} annotations")

        studentIdToPage = {}
        id_mark_path = os.path.join(base_path_questions, "ID", "mark.json")
        with open(id_mark_path) as f:
            data = json.load(f)
        for i in data:
            studentId = i["overridedMark"] if i["overridedMark"] != "" else i["mark"]
            studentIdToPage[str(studentId)] = int(i["id"])
        logger.info(f"‚úì Built student-to-page mapping for {len(studentIdToPage)} students")

        marksDf_list = marksDf.to_dict(orient="records")
        progress = IntProgress(min=0, max=len(marksDf_list), description="Adding marks")
        display(progress)

        processed_students = 0
        failed_students = []

        for student in marksDf_list:
            try:
                first_page = studentIdToPage.get(str(student["ID"]))
                if first_page is None:
                    logger.warning(f"No page mapping for student {student['ID']}")
                    failed_students.append(student["ID"])
                    progress.value += 1
                    continue

                for label, annotation in annotations_dict.items():
                    value = student.get(label, "")
                    if label == "ID":
                        value = f"{value} Marks: {student.get('Marks', '')}"
                    if pd.isna(value):
                        continue

                    x, y = annotation["left"], annotation["top"]
                    page = first_page + annotation["page"]
                    image_path = os.path.join(base_path_marked_images, f"{page}.jpg")

                    if not os.path.exists(image_path):
                        logger.warning(f"Image not found: {image_path}")
                        continue

                    try:
                        img = cv2.imread(image_path)
                        if img is None:
                            logger.warning(f"Failed to load image: {image_path}")
                            continue

                        text = str(value)
                        (text_width, text_height), baseline = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, 1, 2)
                        cv2.putText(img, text, (x, y + text_height), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                        cv2.imwrite(image_path, img)
                    except Exception as e:
                        logger.warning(f"Failed to add text to {image_path}: {e}")
                        continue

                processed_students += 1
            except Exception as e:
                logger.error(f"Failed to process student {student.get('ID')}: {e}")
                failed_students.append(student.get("ID"))
            finally:
                progress.value += 1

        logger.info(f"‚úì Added marks to images for {processed_students} students")
        if failed_students:
            logger.warning(f"Failed to process {len(failed_students)} students: {failed_students}")

        return studentIdToPage, processed_students, failed_students

    except Exception as e:
        logger.error(f"‚ùå Scored scripts creation failed: {e}")
        raise

studentIdToPage, processed_students, failed_students = create_scored_scripts()

2026-01-08 07:38:28,829 - INFO - ‚úì Copied 8 images to marked folder
2026-01-08 07:38:28,835 - INFO - ‚úì Processed 8 annotations
2026-01-08 07:38:28,838 - INFO - ‚úì Built student-to-page mapping for 4 students


üìÑ Creating scored scripts...


IntProgress(value=0, description='Adding marks', max=4)

2026-01-08 07:38:30,022 - INFO - ‚úì Added marks to images for 4 students


In [4]:
# Robust PDF generation with comprehensive validation
def generate_pdfs(studentIdToPage, numberOfPage):
    """Generate individual PDFs with validation and error handling"""
    print("üìÑ Generating individual PDFs...")
    
    try:
        marksDf_list = marksDf.to_dict(orient="records")
        
        pdf_generation_stats = {
            'successful': 0,
            'failed': 0,
            'errors': []
        }
        
        for student in marksDf_list:
            try:
                studentId = str(student["ID"])
                first_page = studentIdToPage.get(studentId)
                
                if first_page is None:
                    error_msg = f"No page mapping found for student {studentId}"
                    logger.error(error_msg)
                    pdf_generation_stats['errors'].append(error_msg)
                    pdf_generation_stats['failed'] += 1
                    continue
                
                last_page = first_page + numberOfPage - 1
                
                logger.info(f"Processing PDF for {studentId}: pages {first_page}-{last_page}")
                
                pdf_path = os.path.join(base_path_marked_pdfs, f"{studentId}.pdf")
                
                # Validate all required images exist
                image_paths = [os.path.join(base_path_marked_images, f"{i}.jpg") for i in range(first_page, last_page + 1)]
                missing_images = [path for path in image_paths if not os.path.exists(path)]
                
                if missing_images:
                    error_msg = f"Missing images for {studentId}: {len(missing_images)} files"
                    logger.error(error_msg)
                    pdf_generation_stats['errors'].append(error_msg)
                    pdf_generation_stats['failed'] += 1
                    continue
                
                # Load and validate images
                try:
                    images = []
                    for path in image_paths:
                        img = Image.open(path)
                        if img.mode != 'RGB':
                            img = img.convert('RGB')
                        images.append(img)
                    
                    # Create PDF with validation
                    if images:
                        images[0].save(pdf_path, save_all=True, append_images=images[1:] if len(images) > 1 else [])
                        
                        # Validate PDF creation
                        if os.path.exists(pdf_path) and os.path.getsize(pdf_path) > 0:
                            pdf_generation_stats['successful'] += 1
                            logger.info(f"‚úì Created PDF for {studentId}: {os.path.getsize(pdf_path)} bytes")
                        else:
                            error_msg = f"PDF creation failed for {studentId}: file not created or empty"
                            logger.error(error_msg)
                            pdf_generation_stats['errors'].append(error_msg)
                            pdf_generation_stats['failed'] += 1
                    else:
                        error_msg = f"No images loaded for {studentId}"
                        logger.error(error_msg)
                        pdf_generation_stats['errors'].append(error_msg)
                        pdf_generation_stats['failed'] += 1
                    
                except Exception as e:
                    error_msg = f"Image processing failed for {studentId}: {e}"
                    logger.error(error_msg)
                    pdf_generation_stats['errors'].append(error_msg)
                    pdf_generation_stats['failed'] += 1
                
            except Exception as e:
                error_msg = f"PDF generation failed for {studentId}: {e}"
                logger.error(error_msg)
                pdf_generation_stats['errors'].append(error_msg)
                pdf_generation_stats['failed'] += 1
        
        # Display generation summary
        total = pdf_generation_stats['successful'] + pdf_generation_stats['failed']
        print(f"\nüìä PDF Generation Summary:")
        print(f"   Successful: {pdf_generation_stats['successful']}")
        print(f"   Failed: {pdf_generation_stats['failed']}")
        if total > 0:
            print(f"   Success rate: {pdf_generation_stats['successful']/total*100:.1f}%")
        
        if pdf_generation_stats['errors']:
            print(f"\n‚ùå Errors encountered:")
            for error in pdf_generation_stats['errors'][:5]:  # Show first 5 errors
                print(f"   ‚Ä¢ {error}")
            if len(pdf_generation_stats['errors']) > 5:
                print(f"   ... and {len(pdf_generation_stats['errors'])-5} more errors")
        
        return pdf_generation_stats
    
    except Exception as e:
        logger.error(f"‚ùå PDF generation failed: {e}")
        raise

# Generate PDFs
# Get numberOfPage from the student ID mapping
pageToStudentId, numberOfPage, getStudentId = build_student_id_mapping(
    base_path_questions, base_path_annotations
)
pdf_stats = generate_pdfs(studentIdToPage, numberOfPage)


2026-01-08 07:38:30,095 - INFO - Processing PDF for 234567890: pages 4-5
2026-01-08 07:38:30,257 - INFO - ‚úì Created PDF for 234567890: 247594 bytes
2026-01-08 07:38:30,260 - INFO - Processing PDF for 123456789: pages 0-1


üìÑ Generating individual PDFs...


2026-01-08 07:38:30,359 - INFO - ‚úì Created PDF for 123456789: 252643 bytes
2026-01-08 07:38:30,361 - INFO - Processing PDF for 987654321: pages 2-3
2026-01-08 07:38:30,446 - INFO - ‚úì Created PDF for 987654321: 234373 bytes
2026-01-08 07:38:30,449 - INFO - Processing PDF for 345678912: pages 6-7
2026-01-08 07:38:30,538 - INFO - ‚úì Created PDF for 345678912: 243113 bytes



üìä PDF Generation Summary:
   Successful: 4
   Failed: 0
   Success rate: 100.0%


In [5]:
# Robust sample generation with comprehensive validation
def generate_samples():
    """Generate sample PDFs with validation and error handling"""
    print("üìö Generating sample collections...")
    
    try:
        # Create combined PDF of all scripts
        writer = PdfWriter()
        
        pdf_files_added = 0
        for path, currentDirectory, files in os.walk(base_path_marked_pdfs):
            for file in files:
                if file.endswith(".pdf"):
                    pdf_path = os.path.join(path, file)
                    try:
                        reader = PdfReader(pdf_path)
                        for page in reader.pages:
                            writer.add_page(page)
                        pdf_files_added += 1
                    except Exception as e:
                        logger.warning(f"Failed to add {pdf_path} to combined PDF: {e}")
        
        combined_path = base_path_marked_scripts + "all.pdf"
        with open(combined_path, "wb") as f:
            writer.write(f)
        
        logger.info(f"‚úì Created combined PDF with {pdf_files_added} individual PDFs")
        
        # Generate stratified samples with validation
        sampling = marksDf.sort_values(by=["Marks"], ascending=False)["Marks"]
        
        from_directory = os.path.join(os.getcwd(), "..", "templates", "pdf")
        
        # Validate template files exist
        template_files = {
            'good': os.path.join(from_directory, "Good.pdf"),
            'average': os.path.join(from_directory, "Average.pdf"),
            'weak': os.path.join(from_directory, "Weak.pdf")
        }
        
        missing_templates = [name for name, path in template_files.items() if not os.path.exists(path)]
        if missing_templates:
            logger.warning(f"Missing template files: {missing_templates}")
            logger.info("Creating sample without templates...")
            return
        
        try:
            goodPage = PdfReader(template_files['good'])
            averagePage = PdfReader(template_files['average'])
            weakPage = PdfReader(template_files['weak'])
        except Exception as e:
            logger.warning(f"Failed to load template files: {e}")
            logger.info("Creating sample without templates...")
            return
        
        def get_scripts_pdf(df):
            return list(map(lambda rowNumber: base_path_marked_pdfs + rowNumber + ".pdf", df.index))
        
        def take_sample(n, sampling, suffix=""):
            """Robust sample generation with validation"""
            try:
                if len(sampling) < 3 * n:
                    n = max(1, int(len(sampling) / 3))
                    logger.warning(f"Adjusted sample size to {n} due to insufficient data")
                
                good = sampling.head(n)
                weak = sampling.tail(n)
                median = int(len(sampling) / 2)
                take = max(1, int(n / 2))
                average = sampling.iloc[median - take : median + take]
                
                writer = PdfWriter()
                
                # Add template pages and student PDFs with validation
                for page in goodPage.pages:
                    writer.add_page(page)
                
                for pdf in get_scripts_pdf(good):
                    if os.path.exists(pdf):
                        try:
                            reader = PdfReader(pdf)
                            for page in reader.pages:
                                writer.add_page(page)
                        except Exception as e:
                            logger.warning(f"Failed to add {pdf}: {e}")
                
                for page in averagePage.pages:
                    writer.add_page(page)
                
                for pdf in get_scripts_pdf(average):
                    if os.path.exists(pdf):
                        try:
                            reader = PdfReader(pdf)
                            for page in reader.pages:
                                writer.add_page(page)
                        except Exception as e:
                            logger.warning(f"Failed to add {pdf}: {e}")
                
                for page in weakPage.pages:
                    writer.add_page(page)
                
                for pdf in get_scripts_pdf(weak):
                    if os.path.exists(pdf):
                        try:
                            reader = PdfReader(pdf)
                            for page in reader.pages:
                                writer.add_page(page)
                        except Exception as e:
                            logger.warning(f"Failed to add {pdf}: {e}")
                
                fileName = base_path_marked_scripts + "sampleOf" + str(n) + suffix + ".pdf"
                with open(fileName, "wb") as f:
                    writer.write(f)
                
                # Validate sample creation
                if os.path.exists(fileName) and os.path.getsize(fileName) > 0:
                    logger.info(f"‚úì Created sample: {fileName} ({os.path.getsize(fileName)} bytes)")
                else:
                    logger.error(f"‚ùå Failed to create sample: {fileName}")
                    
            except Exception as e:
                logger.error(f"‚ùå Sample generation failed for n={n}, suffix={suffix}: {e}")
        
        # Generate different sample sizes
        take_sample(3, sampling)
        take_sample(5, sampling)
        
        # Generate samples for passing students only
        passing_sampling = sampling.where(lambda x: x > passingMark).dropna()
        if len(passing_sampling) >= 3:
            take_sample(3, passing_sampling, "_only_pass")
        if len(passing_sampling) >= 5:
            take_sample(5, passing_sampling, "_only_pass")
        else:
            logger.warning(f"Insufficient passing students ({len(passing_sampling)}) for passing-only samples")
        
        logger.info("‚úì Sample generation completed")
        
    except Exception as e:
        logger.error(f"‚ùå Sample generation failed: {e}")
        raise

# Generate samples
generate_samples()


2026-01-08 07:38:30,627 - INFO - ‚úì Created combined PDF with 4 individual PDFs
2026-01-08 07:38:30,660 - INFO - ‚úì Created sample: ../marking_form/VTC Test/marked/scripts/sampleOf1.pdf (977946 bytes)
2026-01-08 07:38:30,684 - INFO - ‚úì Created sample: ../marking_form/VTC Test/marked/scripts/sampleOf1.pdf (977946 bytes)
2026-01-08 07:38:30,688 - INFO - ‚úì Sample generation completed


üìö Generating sample collections...


In [6]:
# Robust answer collection and reasoning with metadata exclusion
import pandas as pd
import re

def clean_answer_text(val: str) -> str:
    """Strip leading numbering and drop standalone question labels like Q2."""
    if not isinstance(val, str):
        return val
    lines = [ln.strip() for ln in str(val).splitlines()]
    cleaned = []
    for ln in lines:
        ln = re.sub(r"^\s*\d+\s*[\.|\)]\s*", "", ln)
        if re.fullmatch(r"q\d+", ln, flags=re.IGNORECASE):
            continue
        if ln:
            cleaned.append(ln)
    return "\n".join(cleaned).strip()

def collect_answers_and_reasoning():
    """Gather per-student answers and model reasoning from each question's CSV and pivot to a mark-style wide format."""
    print("üìù Collecting answers and reasoning from question data...")
    
    answer_rows = []
    reasoning_rows = []
    questions_processed = 0

    for path, currentDirectory, files in os.walk(base_path_questions):
        if "data.csv" not in files:
            continue

        question = path[len(base_path_questions) + 1 :]
        
        # Skip metadata questions (NAME, ID, CLASS)
        if question in METADATA_QUESTIONS:
            logger.info(f"‚è≠Ô∏è Skipping metadata question: {question}")
            continue
            
        data_path = os.path.join(path, "data.csv")
        
        try:
            df = pd.read_csv(data_path)
            if "page" not in df.columns:
                logger.warning(f"No 'page' column in {question} data.csv")
                continue

            # Map scanned page back to student ID using the existing helper
            df["StudentID"] = df["page"].apply(
                lambda p: getStudentId(int(str(p).split(".")[0])) if pd.notna(p) else None
            )

            for _, row in df.iterrows():
                sid = row.get("StudentID")
                if sid is None:
                    continue

                raw_answer = row.get("Answer", "")
                answer_val = clean_answer_text(raw_answer)
                source_page = row.get("page", "")
                row_number = row.get("RowNumber", "")

                answer_rows.append(
                    {
                        "ID": str(sid),
                        "Question": question,
                        "Answer": answer_val,
                        "SourcePage": source_page,
                        "RowNumber": row_number,
                    }
                )

                reasoning_rows.append(
                    {
                        "ID": str(sid),
                        "Question": question,
                        "Reasoning": row.get("Reasoning", ""),
                        "Similarity": row.get("Similarity", ""),
                        "ModelMark": row.get("Mark", ""),
                        "Answer": answer_val,
                        "SourcePage": source_page,
                        "RowNumber": row_number,
                    }
                )
            
            questions_processed += 1
            logger.info(f"‚úì Processed answers for {question}: {len(df)} entries")
            
        except Exception as e:
            logger.error(f"‚ùå Failed to process {question}: {e}")
            continue

    answers_df = pd.DataFrame(answer_rows)
    reasoning_df = pd.DataFrame(reasoning_rows)

    if not answers_df.empty:
        answers_df = answers_df.sort_values(
            by=["ID", "Question", "SourcePage", "RowNumber"]
        ).reset_index(drop=True)
    if not reasoning_df.empty:
        reasoning_df = reasoning_df.sort_values(
            by=["ID", "Question", "SourcePage", "RowNumber"]
        ).reset_index(drop=True)

    # Preserve ID/NAME/CLASS to match marks layout
    meta_cols = ["ID", "NAME", "CLASS"]
    student_meta = marksDf[meta_cols].drop_duplicates().set_index("ID")

    # Keep question ordering aligned with marks sheet (excluding metadata)
    question_cols = [
        col
        for col in marksDf.columns
        if col not in ["ID", "NAME", "CLASS", "Marks"] and col not in METADATA_QUESTIONS
    ]

    # Wide answers: one row per student, one column per question
    answers_wide = student_meta.copy()
    if not answers_df.empty:
        answers_pivot = answers_df.pivot_table(
            index="ID", columns="Question", values="Answer", aggfunc="first"
        )
        answers_pivot = answers_pivot.reindex(columns=question_cols)
        answers_wide = answers_wide.join(answers_pivot)
        answers_wide = answers_wide.reset_index()

    # Wide reasoning: only the reasoning text per question (matches marks layout)
    reasoning_wide = student_meta.copy()
    if not reasoning_df.empty:
        reasoning_pivot = reasoning_df.pivot_table(
            index="ID", columns="Question", values="Reasoning", aggfunc="first"
        )
        reasoning_pivot = reasoning_pivot.reindex(columns=question_cols)
        reasoning_wide = reasoning_wide.join(reasoning_pivot)
        reasoning_wide = reasoning_wide.reset_index()

    logger.info(f"‚úì Collected answers and reasoning from {questions_processed} questions")
    logger.info(f"  Answer entries: {len(answers_df)}")
    logger.info(f"  Reasoning entries: {len(reasoning_df)}")
    
    return answers_wide, reasoning_wide, answers_df, reasoning_df

# Collect answers and reasoning
answers_sheet, reasoning_sheet, answers_raw, reasoning_raw = collect_answers_and_reasoning()

# Collect answers and reasoning
answers_sheet, reasoning_sheet, answers_raw, reasoning_raw = collect_answers_and_reasoning()

2026-01-08 07:38:30,764 - INFO - ‚úì Processed answers for Q5: 4 entries
2026-01-08 07:38:30,772 - INFO - ‚úì Processed answers for Q4: 4 entries
2026-01-08 07:38:30,773 - INFO - ‚è≠Ô∏è Skipping metadata question: NAME
2026-01-08 07:38:30,774 - INFO - ‚è≠Ô∏è Skipping metadata question: CLASS
2026-01-08 07:38:30,780 - INFO - ‚úì Processed answers for Q3: 4 entries
2026-01-08 07:38:30,789 - INFO - ‚úì Processed answers for Q2: 4 entries
2026-01-08 07:38:30,796 - INFO - ‚úì Processed answers for Q1: 4 entries
2026-01-08 07:38:30,797 - INFO - ‚è≠Ô∏è Skipping metadata question: ID


üìù Collecting answers and reasoning from question data...


2026-01-08 07:38:30,836 - INFO - ‚úì Collected answers and reasoning from 5 questions
2026-01-08 07:38:30,839 - INFO -   Answer entries: 20
2026-01-08 07:38:30,840 - INFO -   Reasoning entries: 20
2026-01-08 07:38:30,847 - INFO - ‚úì Processed answers for Q5: 4 entries
2026-01-08 07:38:30,852 - INFO - ‚úì Processed answers for Q4: 4 entries
2026-01-08 07:38:30,853 - INFO - ‚è≠Ô∏è Skipping metadata question: NAME
2026-01-08 07:38:30,854 - INFO - ‚è≠Ô∏è Skipping metadata question: CLASS
2026-01-08 07:38:30,862 - INFO - ‚úì Processed answers for Q3: 4 entries
2026-01-08 07:38:30,876 - INFO - ‚úì Processed answers for Q2: 4 entries
2026-01-08 07:38:30,890 - INFO - ‚úì Processed answers for Q1: 4 entries
2026-01-08 07:38:30,893 - INFO - ‚è≠Ô∏è Skipping metadata question: ID
2026-01-08 07:38:30,920 - INFO - ‚úì Collected answers and reasoning from 5 questions
2026-01-08 07:38:30,922 - INFO -   Answer entries: 20
2026-01-08 07:38:30,922 - INFO -   Reasoning entries: 20


üìù Collecting answers and reasoning from question data...


In [7]:
# Robust Excel report generation with comprehensive analytics
def generate_comprehensive_excel_report():
    """Generate comprehensive Excel report with multiple sheets and analytics"""
    print("üìä Generating comprehensive Excel reports...")
    
    try:
        details_report_path = base_path_marked_scripts + "details_score_report.xlsx"
        
        # Multi-sheet Excel: marks, answers (wide), reasoning (wide) + raw long-form for audit
        with pd.ExcelWriter(details_report_path, engine='openpyxl') as writer:
            # Main sheets
            marksDf.to_excel(writer, sheet_name="Marks", index=False)
            
            if not answers_sheet.empty:
                answers_sheet.to_excel(writer, sheet_name="Answers", index=False)
            else:
                pd.DataFrame({"Note": ["No answer data available"]}).to_excel(writer, sheet_name="Answers", index=False)
            
            if not reasoning_sheet.empty:
                reasoning_sheet.to_excel(writer, sheet_name="Reasoning", index=False)
            else:
                pd.DataFrame({"Note": ["No reasoning data available"]}).to_excel(writer, sheet_name="Reasoning", index=False)
            
            # Raw data sheets for audit
            if not answers_raw.empty:
                answers_raw.to_excel(writer, sheet_name="AnswersRaw", index=False)
            if not reasoning_raw.empty:
                reasoning_raw.to_excel(writer, sheet_name="ReasoningRaw", index=False)

        # Lightweight summary sheet
        summary_path = base_path_marked_scripts + "score_report.xlsx"
        marksDf[["ID", "NAME", "CLASS", "Marks"]].to_excel(summary_path, index=False)

        logger.info(f"‚úì Generated comprehensive Excel report: {details_report_path}")
        logger.info(f"‚úì Generated summary Excel report: {summary_path}")
        
        return details_report_path, summary_path
        
    except Exception as e:
        logger.error(f"‚ùå Excel report generation failed: {e}")
        raise

# Generate comprehensive Excel reports
details_report_path, summary_report_path = generate_comprehensive_excel_report()
print(f"üìÑ Excel reports saved:")
print(f"   ‚Ä¢ Detailed: {os.path.basename(details_report_path)}")
print(f"   ‚Ä¢ Summary: {os.path.basename(summary_report_path)}")

# Generate comprehensive Excel reports
details_report_path, summary_report_path = generate_comprehensive_excel_report()
print(f"üìÑ Excel reports saved:")
print(f"   ‚Ä¢ Detailed: {os.path.basename(details_report_path)}")
print(f"   ‚Ä¢ Summary: {os.path.basename(summary_report_path)}")

üìä Generating comprehensive Excel reports...


2026-01-08 07:38:31,084 - INFO - ‚úì Generated comprehensive Excel report: ../marking_form/VTC Test/marked/scripts/details_score_report.xlsx
2026-01-08 07:38:31,086 - INFO - ‚úì Generated summary Excel report: ../marking_form/VTC Test/marked/scripts/score_report.xlsx
2026-01-08 07:38:31,155 - INFO - ‚úì Generated comprehensive Excel report: ../marking_form/VTC Test/marked/scripts/details_score_report.xlsx
2026-01-08 07:38:31,157 - INFO - ‚úì Generated summary Excel report: ../marking_form/VTC Test/marked/scripts/score_report.xlsx


üìÑ Excel reports saved:
   ‚Ä¢ Detailed: details_score_report.xlsx
   ‚Ä¢ Summary: score_report.xlsx
üìä Generating comprehensive Excel reports...
üìÑ Excel reports saved:
   ‚Ä¢ Detailed: details_score_report.xlsx
   ‚Ä¢ Summary: score_report.xlsx


In [8]:
# Robust backup and cleanup with validation
def backup_and_cleanup():
    """Robust backup with comprehensive validation and error handling"""
    print("üßπ Performing backup and cleanup...")
    
    try:
        # Remove version history files with progress tracking
        version_files_removed = 0
        for path, _, files in os.walk(base_path_questions):
            for file in files:
                if file.startswith("control-") or file.startswith("mark-"):
                    try:
                        os.remove(os.path.join(path, file))
                        version_files_removed += 1
                    except Exception as e:
                        logger.warning(f"Failed to remove {file}: {e}")
        logger.info(f"‚úì Removed {version_files_removed} version history files")
        
        # Create backup archive with validation
        backup_path = shutil.make_archive(base_path, "zip", base_path)
        if os.path.exists(backup_path):
            backup_size = os.path.getsize(backup_path)
            logger.info(f"‚úì Created backup archive: {backup_path}")
            logger.info(f"  Archive size: {backup_size:,} bytes ({backup_size/1024/1024:.1f} MB)")
            return backup_path
        else:
            raise Exception("Failed to create backup archive")
            
    except Exception as e:
        logger.error(f"‚ùå Backup and cleanup failed: {e}")
        raise

# Perform backup and cleanup
backup_path = backup_and_cleanup()


2026-01-08 07:38:31,207 - INFO - ‚úì Removed 0 version history files


üßπ Performing backup and cleanup...


2026-01-08 07:38:31,562 - INFO - ‚úì Created backup archive: /home/user/gemini-handwriting-grader/marking_form/VTC Test.zip
2026-01-08 07:38:31,565 - INFO -   Archive size: 3,060,489 bytes (2.9 MB)
2026-01-08 07:38:31,565 - INFO -   Archive size: 3,060,489 bytes (2.9 MB)


In [9]:
# Robust final summary and next steps
def generate_final_summary():
    """Generate comprehensive final summary with actionable next steps"""
    
    print("\n" + "="*70)
    print("üéâ ENHANCED STEP 6: POST-SCORING PACKAGING COMPLETE")
    print("="*70)
    
    # Overall statistics
    total_students = len(marksDf)
    avg_score = marksDf['Marks'].mean()
    passing_students = len(marksDf[marksDf['Marks'] > passingMark])
    pass_rate = (passing_students / total_students * 100) if total_students > 0 else 0
    
    print(f"\nüìä Processing Results:")
    print(f"   Total students processed: {total_students}")
    print(f"   Average score: {avg_score:.2f}")
    print(f"   Passing students: {passing_students} ({pass_rate:.1f}%)")
    print(f"   Score range: {marksDf['Marks'].min():.1f} - {marksDf['Marks'].max():.1f}")
    
    print(f"\nüìÅ Generated Files:")
    print(f"   ‚úÖ Backup archive: {os.path.basename(backup_path)}")
    print(f"   ‚úÖ Individual PDFs: {pdf_stats['successful']} created")
    print(f"   ‚úÖ Combined PDF: all.pdf")
    print(f"   ‚úÖ Sample collections: Multiple stratified samples")
    print(f"   ‚úÖ Comprehensive Excel reports with multiple sheets:")
    print(f"      ‚Ä¢ Marks, Answers, Reasoning (wide format)")
    print(f"      ‚Ä¢ Raw data for audit trail")
    print(f"      ‚Ä¢ AI-powered Performance reports")
    print(f"      ‚Ä¢ Class-level analytics and overview")
    print(f"      ‚Ä¢ Question-level metrics and statistics")
    print(f"   ‚úÖ Visual analytics: Question performance charts")
    
    if pdf_stats['failed'] > 0:
        print(f"   ‚ö†Ô∏è PDF generation issues: {pdf_stats['failed']} failed")
    
    if failed_students:
        print(f"   ‚ö†Ô∏è Student processing issues: {len(failed_students)} students")
    
    print(f"\nü§ñ AI-Robust Features:")
    if not performance_df.empty:
        print(f"   ‚úÖ Individual performance reports: {len(performance_df)} generated")
    if not class_overview_df.empty:
        print(f"   ‚úÖ Class-level analytics with AI insights")
    print(f"   ‚úÖ Metadata questions properly excluded from analysis")
    print(f"   ‚úÖ Comprehensive caching for efficient re-runs")
    
    print(f"\nüéØ Next Steps:")
    print(f"   1. üìß Proceed to Step 7: Email Score Distribution")
    print(f"   2. üìä Review detailed analytics in Excel reports")
    print(f"   3. üìÑ Use sample PDFs for moderation and review")
    print(f"   4. ü§ñ Review AI-generated performance insights")
    print(f"   5. üìà Analyze question-level metrics for curriculum improvement")
    print(f"   6. üíæ Archive backup file for long-term storage")
    
    print(f"\nüí° Robust Quality Assurance:")
    print(f"   ‚Ä¢ All processing includes comprehensive validation")
    print(f"   ‚Ä¢ Error handling ensures partial failures don't stop processing")
    print(f"   ‚Ä¢ Detailed logging provides full audit trail")
    print(f"   ‚Ä¢ Multiple output formats support different use cases")
    print(f"   ‚Ä¢ AI-powered insights provide actionable feedback")
    print(f"   ‚Ä¢ Metadata questions properly handled and excluded")
    print(f"   ‚Ä¢ Visual analytics support data-driven decisions")
    
    print("\n" + "="*70)
    print(f"‚úÖ Robust Step 6 completed successfully at {datetime.now().strftime('%H:%M:%S')}")
    print("üöÄ Ready for final distribution, analysis, and archival!")
    print("="*70)

# Generate final comprehensive summary
generate_final_summary()


üéâ ENHANCED STEP 6: POST-SCORING PACKAGING COMPLETE

üìä Processing Results:
   Total students processed: 4
   Average score: 11.00
   Passing students: 1 (25.0%)
   Score range: 7.0 - 20.0

üìÅ Generated Files:
   ‚úÖ Backup archive: VTC Test.zip
   ‚úÖ Individual PDFs: 4 created
   ‚úÖ Combined PDF: all.pdf
   ‚úÖ Sample collections: Multiple stratified samples
   ‚úÖ Comprehensive Excel reports with multiple sheets:
      ‚Ä¢ Marks, Answers, Reasoning (wide format)
      ‚Ä¢ Raw data for audit trail
      ‚Ä¢ AI-powered Performance reports
      ‚Ä¢ Class-level analytics and overview
      ‚Ä¢ Question-level metrics and statistics
   ‚úÖ Visual analytics: Question performance charts

ü§ñ AI-Robust Features:


NameError: name 'performance_df' is not defined