# Step 1: Generate Answer Sheets
Create personalized answer sheets by merging the name list into the template, exporting per-student PDFs, and combining them for printing.

**Features:**
- ✅ Comprehensive validation of input files and data
- ✅ Progress tracking with visual progress bars
- ✅ Robust error handling and recovery
- ✅ Detailed logging and reporting
- ✅ Robust PDF merging with validation
- ✅ Processing statistics and next steps guidance


In [8]:
from grading_utils import setup_paths, validate_required_files, validate_student_ids, print_validation_summary
import logging
import os

# Send INFO-and-above log records to the notebook using a timestamped format.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Exam prefix: passed to setup_paths and used to build the template file name.
prefix = "VTC Test"
paths = setup_paths(prefix, "sample")

# Abort early, listing every entry validate_required_files reported as missing.
missing_files = validate_required_files(paths)
if missing_files:
    print("‚ùå Setup validation failed!")
    for missing_path in missing_files:
        print(f"  Missing: {missing_path}")
    raise FileNotFoundError("Please ensure all required files are present.")

name_list = paths["name_list_file"]
answer_sheet = f"../sample/{prefix} Answer Sheet.docx"

# The Word template is checked explicitly here; without it no sheet can be built.
if os.path.exists(answer_sheet):
    print("‚úÖ Setup validation passed")
else:
    logger.error(f"Answer sheet template not found: {answer_sheet}")
    raise FileNotFoundError(f"Answer sheet template required: {answer_sheet}")

‚úÖ Setup validation passed


In [9]:
import pandas as pd

# Load the class roster from the Excel name list.
# Only the read itself is guarded: a failure while rendering the preview must
# not be logged as "Failed to load name list".
try:
    df = pd.read_excel(name_list)
except Exception as e:
    # Log for the notebook record, then re-raise so later cells cannot run
    # against a missing or stale `df`.
    logger.error(f"Failed to load name list: {e}")
    raise
else:
    logger.info(f"‚úì Loaded name list with {len(df)} students")

    # Display data for verification
    print("Student data preview:")
    display(df.head())

2026-01-07 06:17:04,986 - INFO - ‚úì Loaded name list with 4 students


Student data preview:


Unnamed: 0,NAME,ID,CLASS
0,Peter,123456789,A
1,Mary,987654321,B
2,John,234567890,C
3,Susan,345678912,D


In [10]:
# Check the schema first: the generation and cleanup steps index every row by
# these columns, so a missing column is reported by name before any
# content-level validation touches the data.
required_columns = ['NAME', 'ID', 'CLASS']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    raise ValueError(f"Missing required columns: {missing_columns}")

# Robust validation using utility functions
# NOTE(review): the "unique students" wording below presumably relies on
# validate_student_ids rejecting duplicate IDs — confirm in grading_utils.
is_valid, errors = validate_student_ids(df)
print_validation_summary("Student Data Validation", is_valid, errors)

if not is_valid:
    raise ValueError("Student data validation failed")

print(f"‚úì Student data validation passed - {len(df)} unique students")


üìã Student Data Validation
‚úÖ VALIDATION PASSED
   All checks completed successfully
‚úì Student data validation passed - 4 unique students


In [11]:
from docx import Document
from pypdf import PdfWriter, PdfReader
import subprocess
import os
import time
from tqdm import tqdm
from datetime import datetime
import json

# Robust document processing with progress tracking and error recovery
output_dir = "../data"
os.makedirs(output_dir, exist_ok=True)


def _fill_placeholders(paragraphs, replacements):
    """Replace each label with its filled-in text, run by run.

    Working on runs (not whole paragraphs) preserves the template formatting.
    A label that Word has split across several runs is not matched.

    Args:
        paragraphs: Iterable of python-docx paragraphs to edit in place.
        replacements: Mapping of label (e.g. 'Name:') to replacement text
            (e.g. 'Name: Peter'), applied in insertion order.
    """
    for paragraph in paragraphs:
        for run in paragraph.runs:
            for label, filled in replacements.items():
                # Skip runs that already carry the filled text: a merged table
                # cell can be visited more than once and must not end up as
                # "Name: Peter Peter".
                if label in run.text and filled not in run.text:
                    run.text = run.text.replace(label, filled)


def _personalize_document(doc, replacements):
    """Apply *replacements* to the body paragraphs and every table cell of *doc*."""
    _fill_placeholders(doc.paragraphs, replacements)
    for table in doc.tables:
        for table_row in table.rows:
            for cell in table_row.cells:
                _fill_placeholders(cell.paragraphs, replacements)


def _convert_docx_to_pdf(docx_path, pdf_path, out_dir, timeout=30):
    """Convert *docx_path* to PDF in *out_dir* with headless LibreOffice.

    Args:
        docx_path: Path of the .docx file to convert.
        pdf_path: Path where LibreOffice is expected to write the PDF.
        out_dir: Directory handed to LibreOffice's --outdir option.
        timeout: Seconds to wait before subprocess raises TimeoutExpired.

    Raises:
        subprocess.CalledProcessError: LibreOffice exited with a non-zero
            status; stdout and stderr are attached to the exception.
        subprocess.TimeoutExpired: The conversion exceeded *timeout*.
    """
    # Drop any PDF left by an earlier run so the caller's existence check
    # cannot be satisfied by a stale file.
    if os.path.exists(pdf_path):
        os.remove(pdf_path)

    result = subprocess.run([
        'libreoffice', '--headless', '--convert-to', 'pdf',
        '--outdir', out_dir, docx_path
    ], capture_output=True, text=True, timeout=timeout)

    if result.returncode != 0:
        raise subprocess.CalledProcessError(
            result.returncode, 'libreoffice',
            output=result.stdout, stderr=result.stderr,
        )


individual_pdfs = []
failed_students = []
start_time = time.time()

print("Processing student answer sheets...")
progress_bar = tqdm(df.iterrows(), total=len(df), desc="Creating PDFs")

for _, row in progress_bar:
    try:
        progress_bar.set_description(f"Processing {row['NAME']}")

        # Load a fresh copy of the template for every student
        doc = Document(answer_sheet)
        _personalize_document(doc, {
            'Name:': f"Name: {row['NAME']}",
            'Student ID:': f"Student ID: {row['ID']}",
            'Class:': f"Class: {row['CLASS']}",
        })

        # Save the modified docx using ID as filename (unique identifier)
        base_name = f"{prefix} Answer Sheet - {row['ID']}"
        docx_filename = os.path.join(output_dir, f"{base_name}.docx")
        pdf_filename = os.path.join(output_dir, f"{base_name}.pdf")
        doc.save(docx_filename)

        # Convert to PDF using LibreOffice with error handling
        _convert_docx_to_pdf(docx_filename, pdf_filename, output_dir)

        # Verify PDF was created (a zero exit status alone is not trusted)
        if not os.path.exists(pdf_filename):
            raise FileNotFoundError(f"PDF conversion failed for {row['NAME']}")

        individual_pdfs.append(pdf_filename)
        logger.info(f"‚úì Created PDF for {row['NAME']} (ID: {row['ID']})")

    except Exception as e:
        # One student's failure must not stop the batch: record it and move on.
        error_msg = f"Failed to process {row['NAME']}: {e}"
        # CalledProcessError's own message omits LibreOffice's diagnostics.
        stderr_text = getattr(e, 'stderr', None)
        if isinstance(stderr_text, str) and stderr_text.strip():
            error_msg += f" | stderr: {stderr_text.strip()}"
        logger.error(error_msg)
        failed_students.append(row['ID'])

    # Update progress bar after every student, including failed ones
    progress_bar.set_postfix({
        'Success': len(individual_pdfs),
        'Failed': len(failed_students)
    })

processing_time = time.time() - start_time
print(f"\n‚úì Processing completed in {processing_time:.2f} seconds")

Processing student answer sheets...


Processing Peter:   0%|          | 0/4 [00:00<?, ?it/s]2026-01-07 06:17:08,411 - INFO - ‚úì Created PDF for Peter (ID: 123456789)
Processing Mary:  25%|‚ñà‚ñà‚ñå       | 1/4 [00:03<00:09,  3.32s/it, Success=1, Failed=0] 2026-01-07 06:17:14,223 - INFO - ‚úì Created PDF for Mary (ID: 987654321)
Processing John:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 2/4 [00:09<00:09,  4.79s/it, Success=2, Failed=0]2026-01-07 06:17:19,225 - INFO - ‚úì Created PDF for John (ID: 234567890)
Processing Susan:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [00:14<00:04,  4.88s/it, Success=3, Failed=0]2026-01-07 06:17:25,101 - INFO - ‚úì Created PDF for Susan (ID: 345678912)
Processing Susan: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [00:20<00:00,  5.00s/it, Success=4, Failed=0]


‚úì Processing completed in 20.01 seconds





In [12]:
# Robust PDF merging with validation
def merge_pdfs_with_validation(pdf_files, output_path):
    """Merge the readable, non-empty PDFs in *pdf_files* into *output_path*.

    Inputs that are missing, cannot be parsed, or contain no pages are skipped
    with a warning; the remaining files are concatenated in the given order.
    Each input is parsed once and the same reader is reused for the merge.

    Args:
        pdf_files: Paths of the PDFs to combine, in output order.
        output_path: Destination path of the merged PDF.

    Returns:
        True when a merged PDF was written. False when there was nothing valid
        to merge or any step failed; errors are logged, never raised.
    """
    try:
        if not pdf_files:
            logger.error("No PDF files to merge")
            return False

        # Validate every input, keeping the reader of each usable file
        valid_readers = []
        skipped_count = 0
        for pdf_file in pdf_files:
            if not os.path.exists(pdf_file):
                logger.warning(f"PDF file not found: {pdf_file}")
                skipped_count += 1
                continue

            try:
                reader = PdfReader(pdf_file)
                if len(reader.pages) == 0:
                    logger.warning(f"Empty PDF file: {pdf_file}")
                    skipped_count += 1
                    continue
            except Exception as e:
                logger.warning(f"Corrupted PDF file {pdf_file}: {e}")
                skipped_count += 1
                continue

            valid_readers.append(reader)

        if not valid_readers:
            logger.error("No valid PDF files to merge")
            return False

        # A partial merge still returns True, so make the gap visible in one place.
        if skipped_count:
            logger.warning(f"Skipped {skipped_count} PDF file(s); merged output will be incomplete")

        # Merge valid PDFs
        writer = PdfWriter()
        total_pages = 0
        for reader in valid_readers:
            for page in reader.pages:
                writer.add_page(page)
                total_pages += 1

        # Write merged PDF
        with open(output_path, 'wb') as output:
            writer.write(output)

        logger.info(f"‚úì Merged {len(valid_readers)} PDFs into {output_path}")
        logger.info(f"  Total pages: {total_pages}")

        return True

    except Exception as e:
        logger.error(f"Failed to merge PDFs: {e}")
        return False

# Combine the per-student PDFs into one printable file.
# merge_success stays False when no individual PDF was produced.
merge_success = False
if not individual_pdfs:
    print("‚ùå No individual PDFs were created")
else:
    output_file = os.path.join(output_dir, f"{prefix} Answer Sheets Combined.pdf")
    merge_success = merge_pdfs_with_validation(individual_pdfs, output_file)
    print(
        f"‚úÖ Combined PDF created successfully: {output_file}"
        if merge_success
        else "‚ùå Failed to create combined PDF"
    )

2026-01-07 06:17:25,249 - INFO - ‚úì Merged 4 PDFs into ../data/VTC Test Answer Sheets Combined.pdf
2026-01-07 06:17:25,254 - INFO -   Total pages: 8


‚úÖ Combined PDF created successfully: ../data/VTC Test Answer Sheets Combined.pdf


In [13]:
# Robust cleanup and reporting
print("\nCleaning up intermediate files...")
cleaned_count = 0

# Until the combined PDF exists, the per-student PDFs are the only copy of the
# generated sheets. They are removed only after a successful merge, so a failed
# merge can be retried without regenerating everything.
removable_suffixes = [".docx", ".pdf"] if merge_success else [".docx"]
if not merge_success:
    logger.warning("Combined PDF was not created; keeping individual PDFs for a retry")

for _, row in df.iterrows():
    base_path = os.path.join(output_dir, f"{prefix} Answer Sheet - {row['ID']}")

    for suffix in removable_suffixes:
        file_path = base_path + suffix
        if os.path.exists(file_path):
            try:
                os.remove(file_path)
                cleaned_count += 1
            except Exception as e:
                # Best effort: a leftover file is reported, not fatal.
                logger.warning(f"Failed to remove {file_path}: {e}")

print(f"‚úì Cleanup complete! Removed {cleaned_count} intermediate files.")

# Generate processing report
report_data = {
    "exam_name": prefix,
    "processing_date": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "total_students": len(df),
    "successful_pdfs": len(individual_pdfs),
    "failed_pdfs": len(failed_students),
    "processing_time_seconds": round(processing_time, 2),
    "output_directory": output_dir,
    "combined_pdf_created": merge_success,
    # IDs read through pandas may be NumPy scalars, which json cannot
    # serialise; .item() converts those to plain Python values.
    "failed_student_ids": [
        student_id.item() if hasattr(student_id, "item") else student_id
        for student_id in failed_students
    ]
}

# Save report
report_path = os.path.join(output_dir, f"{prefix}_answer_sheets_report.json")
with open(report_path, 'w', encoding='utf-8') as f:
    json.dump(report_data, f, indent=2)

print(f"üìä Processing report saved to: {report_path}")


Cleaning up intermediate files...
‚úì Cleanup complete! Removed 8 intermediate files.
üìä Processing report saved to: ../data/VTC Test_answer_sheets_report.json
üìä Processing report saved to: ../data/VTC Test_answer_sheets_report.json


In [14]:
# Final summary and next steps
# Everything printed here is read from report_data, so the summary always
# matches the JSON report written to disk.
print(f"\n{'='*60}")
print("ANSWER SHEET GENERATION SUMMARY")
print(f"{'='*60}")
print(f"Total students: {report_data['total_students']}")
print(f"Successful PDFs: {report_data['successful_pdfs']}")
print(f"Failed PDFs: {report_data['failed_pdfs']}")
print(f"Processing time: {report_data['processing_time_seconds']}s")
print(f"Combined PDF: {'‚úÖ Created' if report_data['combined_pdf_created'] else '‚ùå Failed'}")

if report_data['failed_pdfs'] > 0:
    print(f"\n‚ö†Ô∏è  Failed student IDs: {report_data['failed_student_ids']}")
    print("   Please check the logs and retry failed students manually")

print(f"\nüìÅ Output files location: {output_dir}")
print(f"üìä Processing report: {report_path}")

# Provide next steps guidance
print(f"\n{'='*60}")
print("NEXT STEPS")
print(f"{'='*60}")

if report_data['failed_pdfs'] == 0 and report_data['combined_pdf_created']:
    print("‚úÖ All answer sheets generated successfully!")
    print("   1. Review the combined PDF for quality")
    print("   2. Print the combined PDF for distribution")
    print("   3. Proceed to Step 2 for marking scheme extraction")
else:
    print("‚ö†Ô∏è  Some issues occurred during processing:")
    # Collect only the steps that apply, then number them consecutively so a
    # merge-only failure does not print a lone step "4.".
    follow_up_steps = []
    if report_data['failed_pdfs'] > 0:
        follow_up_steps.extend([
            "Check LibreOffice installation and permissions",
            "Verify answer sheet template format",
            "Retry failed students manually",
        ])
    if not report_data['combined_pdf_created']:
        follow_up_steps.append("Check individual PDFs and retry merging")
    for step_number, step_text in enumerate(follow_up_steps, start=1):
        print(f"   {step_number}. {step_text}")

print(f"{'='*60}")
print("üéâ Robust Step 1 completed!")


ANSWER SHEET GENERATION SUMMARY
Total students: 4
Successful PDFs: 4
Failed PDFs: 0
Processing time: 20.01s
Combined PDF: ‚úÖ Created

üìÅ Output files location: ../data
üìä Processing report: ../data/VTC Test_answer_sheets_report.json

NEXT STEPS
‚úÖ All answer sheets generated successfully!
   1. Review the combined PDF for quality
   2. Print the combined PDF for distribution
   3. Proceed to Step 2 for marking scheme extraction
üéâ Robust Step 1 completed!
