# Step 5: Post-Scoring Checks
Verify every question has marks, validate IDs against the Name List, and clean versioned mark/control files before packaging.

**Features:**
- ✅ Comprehensive validation with detailed reporting
- ✅ Automatic error detection and suggestions
- ✅ Safe file operations with backup options
- ✅ Detailed logging and statistics
- ✅ Color-coded output for easy review
- ✅ Batch operations with progress tracking

In [1]:
from grading_utils import setup_paths, create_directories
from termcolor import colored
import pandas as pd
import os
import json
import logging
from datetime import datetime
from pathlib import Path
from typing import List, Dict, Tuple

# Logging: timestamped INFO-level records for the whole notebook session
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

print("‚úÖ Robust Step 5: Post-Scoring Checks initialized")
print(f"‚úì Session started at: {datetime.now():%Y-%m-%d %H:%M:%S}")

# Dataset selection: file-name prefix and data root folder
prefix = "VTC Test"
dataset = "sample"

# Resolve all paths once, then pick out the two this notebook reads directly
paths = setup_paths(prefix, dataset)
base_path_questions = paths["base_path_questions"]
name_list_file = paths["name_list_file"]

# Make sure the resolved directories exist before anything is read
create_directories(paths)

# Load the "Name List" sheet; on any failure fall back to an empty DataFrame
# so later cells can detect the problem via name_list_df.empty
try:
    if not os.path.exists(name_list_file):
        raise FileNotFoundError(f"Name list file not found: {name_list_file}")

    name_list_df = pd.read_excel(name_list_file, sheet_name="Name List")

    # The sheet must provide at least these columns
    required_columns = ["ID"]
    present_columns = set(name_list_df.columns)
    missing_columns = [col for col in required_columns if col not in present_columns]
    if missing_columns:
        raise ValueError(f"Missing required columns in Name List: {missing_columns}")

    student_count = len(name_list_df)
    logger.info(f"‚úì Loaded Name List from {name_list_file}")
    logger.info(f"  Total students: {student_count}")

    print(f"‚úì Loaded Name List: {student_count} students")

except Exception as e:
    failure_message = f"‚ùå Failed to load Name List: {e}"
    logger.error(failure_message)
    print(colored(failure_message, "red"))
    name_list_df = pd.DataFrame()

banner = "=" * 60
print("\n" + banner)
print("üîç POST-SCORING VALIDATION CHECKS")
print(banner)

2026-01-05 03:38:28,448 - INFO - ‚úì Loaded Name List from ../sample/VTC Test Name List.xlsx
2026-01-05 03:38:28,449 - INFO -   Total students: 4


‚úÖ Robust Step 5: Post-Scoring Checks initialized
‚úì Session started at: 2026-01-05 03:38:28
‚úì Loaded Name List: 4 students

üîç POST-SCORING VALIDATION CHECKS


In [2]:
# Robust Mark Validation with Detailed Reporting

def validate_marks() -> Tuple[List[str], Dict[str, List[int]]]:
    """
    Validate that all questions have been marked

    Every sub-directory found under ``base_path_questions`` is treated as one
    question. Metadata questions (NAME, ID, CLASS) are counted but not
    validated. A graded question is reported as unfinished when its
    ``mark.json`` is missing, cannot be read or parsed, is not a JSON list,
    or contains an entry whose ``mark`` and ``overridedMark`` are both empty.

    Returns:
        Tuple of (unfinished_questions, questions_with_empty_marks), where
        questions_with_empty_marks maps a question to the 1-based positions
        of its empty entries. ([], {}) is returned if validation itself fails.
    """
    logger.info("Starting mark validation...")

    # Metadata questions that don't need marking validation
    metadata_questions = ["NAME", "ID", "CLASS"]
    # Values meaning "nothing entered"; None covers a JSON null.
    # A numeric 0 is a real mark and is deliberately not listed here.
    empty_values = ("", None)

    unfinished_scoring = []
    questions_with_empty_marks = {}
    # Why each unfinished question was flagged, so the report does not label
    # an unreadable or malformed file as "mark.json missing".
    failure_reasons = {}
    total_questions = 0
    skipped_metadata = 0
    marked_questions = 0

    try:
        for path, _subdirs, files in os.walk(base_path_questions):
            # relpath stays correct when base_path_questions carries a
            # trailing separator, unlike slicing the prefix off by length.
            question = os.path.relpath(path, base_path_questions)
            if question == os.curdir:
                continue

            total_questions += 1

            # Skip validation for metadata questions
            if question in metadata_questions:
                skipped_metadata += 1
                logger.info(f"Skipping validation for metadata question: {question}")
                continue

            # Check if mark.json exists
            if "mark.json" not in files:
                unfinished_scoring.append(question)
                failure_reasons[question] = "mark.json missing"
                logger.warning(f"Missing mark.json for question: {question}")
                continue

            # Validate marks in mark.json
            try:
                mark_file = os.path.join(path, "mark.json")
                with open(mark_file, "r", encoding='utf-8') as f:
                    marks = json.load(f)

                # Validate marks structure
                if not isinstance(marks, list):
                    logger.error(f"Invalid marks structure for {question}: not a list")
                    unfinished_scoring.append(question)
                    failure_reasons[question] = "mark.json is not a list"
                    continue

                # Collect the 1-based positions of entries with no mark at all
                empty_indices = []
                for idx, mark in enumerate(marks):
                    if not isinstance(mark, dict):
                        logger.warning(f"Invalid mark entry at index {idx} for {question}")
                        continue

                    mark_value = mark.get('mark', '')
                    overrided_mark = mark.get('overridedMark', '')

                    if mark_value in empty_values and overrided_mark in empty_values:
                        empty_indices.append(idx + 1)

                if empty_indices:
                    questions_with_empty_marks[question] = empty_indices
                    unfinished_scoring.append(question)
                    logger.warning(f"Question {question} has {len(empty_indices)} empty marks")
                else:
                    marked_questions += 1

            except json.JSONDecodeError as e:
                logger.error(f"Invalid JSON in mark.json for {question}: {e}")
                unfinished_scoring.append(question)
                failure_reasons[question] = "mark.json is not valid JSON"
            except Exception as e:
                # Best effort: one unreadable file must not stop the scan
                logger.error(f"Error reading marks for {question}: {e}")
                unfinished_scoring.append(question)
                failure_reasons[question] = "mark.json could not be read"

        # Report results, using the metadata folders actually seen rather
        # than assuming all three exist
        graded_questions = total_questions - skipped_metadata
        print(f"\nüìä Mark Validation Results:")
        print(f"   Total questions: {total_questions}")
        print(f"   Metadata questions (skipped): {skipped_metadata}")
        print(f"   Graded questions: {graded_questions}")
        print(f"   Fully marked: {marked_questions}")
        print(f"   Incomplete: {len(unfinished_scoring)}")

        if unfinished_scoring:
            print(colored(f"\n‚ö†Ô∏è {len(unfinished_scoring)} question(s) have incomplete marking:", "yellow"))
            for question in unfinished_scoring:
                if question in questions_with_empty_marks:
                    indices = questions_with_empty_marks[question]
                    print(colored(f"   ‚Ä¢ {question}: {len(indices)} empty mark(s) at positions {indices}", "red"))
                else:
                    reason = failure_reasons.get(question, "mark.json missing")
                    print(colored(f"   ‚Ä¢ {question}: {reason}", "red"))
        elif graded_questions == 0:
            # Nothing was checked, so do not claim success
            print(colored("\n‚ö†Ô∏è No graded question folders found - nothing was validated", "yellow"))
        else:
            print(colored("\n‚úÖ All graded questions have been marked!", "green"))

        return unfinished_scoring, questions_with_empty_marks

    except Exception as e:
        logger.error(f"Mark validation failed: {e}")
        print(colored(f"‚ùå Mark validation failed: {e}", "red"))
        return [], {}

# Run mark validation and keep both results as notebook globals:
# the list of unfinished questions and the per-question map of
# empty mark positions (the final summary cell reads both).
unfinished_questions, empty_marks = validate_marks()

2026-01-05 03:38:28,466 - INFO - Starting mark validation...
2026-01-05 03:38:28,468 - INFO - Skipping validation for metadata question: CLASS
2026-01-05 03:38:28,469 - INFO - Skipping validation for metadata question: ID
2026-01-05 03:38:28,470 - INFO - Skipping validation for metadata question: NAME



üìä Mark Validation Results:
   Total questions: 8
   Metadata questions (skipped): 3
   Graded questions: 5
   Fully marked: 5
   Incomplete: 0
[32m
‚úÖ All graded questions have been marked![0m


In [3]:
# Robust ID Validation with Cross-Checking

def _normalize_id(value) -> str:
    """Return an ID value as a stripped string; None and NaN become ''.

    Integral floats are rendered without the trailing ".0" so that a
    numeric spreadsheet cell such as 123.0 compares equal to "123".
    """
    if value is None:
        return ""
    if isinstance(value, float):
        if value != value:  # NaN is the only float not equal to itself
            return ""
        if value.is_integer():
            return str(int(value))
    return str(value).strip()


def validate_student_ids() -> Tuple[List[str], List[str], List[str]]:
    """
    Validate student IDs against the name list

    Reads the marked IDs from ``ID/mark.json`` under ``base_path_questions``
    (an entry's ``overridedMark`` wins over its ``mark``) and compares them
    with the ``ID`` column of ``name_list_df``. Blank IDs on either side are
    ignored.

    Returns:
        Tuple of (missing_from_marks, marked_but_not_in_list, duplicate_ids).
        Three empty lists are also returned when the check cannot run
        (ID mark.json missing, empty name list, or an unexpected error).
    """
    logger.info("Starting ID validation...")

    try:
        # Load marked IDs
        id_mark_file = os.path.join(base_path_questions, "ID", "mark.json")

        if not os.path.exists(id_mark_file):
            logger.error("ID mark.json file not found")
            print(colored("‚ùå ID mark.json file not found!", "red"))
            return [], [], []

        with open(id_mark_file, "r", encoding='utf-8') as f:
            marks = json.load(f)

        if not isinstance(marks, list):
            # Reported through the handler at the bottom of this function
            raise ValueError("ID mark.json does not contain a list")

        # Extract marked IDs (prefer overridedMark if available)
        id_from_mark = []
        for position, mark in enumerate(marks):
            if not isinstance(mark, dict):
                # Skip the bad entry instead of aborting the whole check
                logger.warning(f"Invalid ID mark entry at index {position}")
                continue
            mark_value = _normalize_id(mark.get("overridedMark", "") or mark.get("mark", ""))
            if mark_value:
                id_from_mark.append(mark_value)

        # Get IDs from name list
        if name_list_df.empty:
            logger.warning("Name list is empty, skipping ID validation")
            return [], [], []

        raw_ids = name_list_df["ID"].tolist()
        if "Name" in name_list_df.columns:
            raw_names = name_list_df["Name"].tolist()
        else:
            raw_names = None

        # Normalize once and remember the first name seen for each ID, so the
        # report looks names up by the same normalized key it compares with
        id_from_namelist = []
        names_by_id = {}
        for row, raw_id in enumerate(raw_ids):
            student_id = _normalize_id(raw_id)
            if not student_id:
                continue  # blank row in the sheet
            id_from_namelist.append(student_id)
            if raw_names is not None:
                names_by_id.setdefault(student_id, raw_names[row])

        # Check for duplicates in marked IDs
        duplicate_ids = []
        seen_ids = set()
        for id_val in id_from_mark:
            if id_val in seen_ids and id_val not in duplicate_ids:
                duplicate_ids.append(id_val)
            seen_ids.add(id_val)

        # Sets give O(1) membership; the result lists keep their original order
        namelist_ids = set(id_from_namelist)

        # Find missing IDs (in name list but not marked)
        mark_missing_id = [id_val for id_val in id_from_namelist if id_val not in seen_ids]

        # Find extra IDs (marked but not in name list)
        marked_but_not_in_namelist = [id_val for id_val in id_from_mark if id_val not in namelist_ids]

        # Report results
        print(f"\nüìä ID Validation Results:")
        print(f"   Students in name list: {len(id_from_namelist)}")
        print(f"   Students marked: {len(id_from_mark)}")
        print(f"   Unique marked IDs: {len(seen_ids)}")

        if duplicate_ids:
            print(colored(f"\n‚ö†Ô∏è Duplicate IDs found: {duplicate_ids}", "red"))
            print(colored("   Action required: Check for scanning errors or duplicate submissions", "yellow"))

        if mark_missing_id:
            print(colored(f"\n‚ö†Ô∏è {len(mark_missing_id)} student(s) in name list but not marked:", "yellow"))
            for id_val in mark_missing_id:
                if id_val in names_by_id:
                    name = names_by_id[id_val]
                    print(colored(f"   ‚Ä¢ ID {id_val}: {name} (possibly absent)", "red"))
                else:
                    print(colored(f"   ‚Ä¢ ID {id_val} (possibly absent)", "red"))

        if marked_but_not_in_namelist:
            print(colored(f"\n‚ö†Ô∏è {len(marked_but_not_in_namelist)} marked ID(s) not in name list:", "yellow"))
            for id_val in marked_but_not_in_namelist:
                print(colored(f"   ‚Ä¢ ID {id_val} (check for OCR errors or wrong class)", "red"))
            print(colored("   Action required: Verify these IDs manually", "yellow"))

        if not duplicate_ids and not mark_missing_id and not marked_but_not_in_namelist:
            print(colored("\n‚úÖ All student IDs validated successfully!", "green"))

        return mark_missing_id, marked_but_not_in_namelist, duplicate_ids

    except Exception as e:
        logger.error(f"ID validation failed: {e}")
        print(colored(f"‚ùå ID validation failed: {e}", "red"))
        return [], [], []

# Run ID validation and keep the three result lists as notebook globals
# (IDs missing from the marks, marked IDs not in the name list, duplicates);
# the final summary cell reads all three.
missing_ids, extra_ids, duplicate_ids = validate_student_ids()

2026-01-05 03:38:28,485 - INFO - Starting ID validation...



üìä ID Validation Results:
   Students in name list: 4
   Students marked: 4
   Unique marked IDs: 4
[32m
‚úÖ All student IDs validated successfully![0m


In [4]:
# Robust Version History Cleanup with Safety Checks

def cleanup_version_history(dry_run: bool = False) -> Tuple[int, List[str]]:
    """
    Delete (or only list) versioned mark and control files

    A file qualifies when its name starts with "control-" or "mark-" and
    ends with ".json", anywhere under base_path_questions.

    Args:
        dry_run: When True, print a preview of the matches and delete nothing

    Returns:
        Tuple of (files_removed, file_list). file_list holds every matched
        path; files_removed is 0 for a dry run. (0, []) is returned when
        nothing matches or the cleanup fails.
    """
    logger.info(f"Starting version history cleanup (dry_run={dry_run})...")

    preview_limit = 10   # matches listed in a dry run
    failure_limit = 5    # failed deletions listed afterwards

    try:
        # Gather every versioned file in walk order
        versioned_files = [
            os.path.join(folder, name)
            for folder, _subdirs, names in os.walk(base_path_questions)
            for name in names
            if name.startswith(("control-", "mark-")) and name.endswith(".json")
        ]
        match_count = len(versioned_files)

        print(f"\nüìÅ Version History Cleanup:")
        print(f"   Files found: {match_count}")

        if match_count == 0:
            print(colored("   ‚úì No version history files to clean", "green"))
            return 0, []

        if dry_run:
            print(colored(f"\n   DRY RUN - Would remove {match_count} file(s):", "yellow"))
            for candidate in versioned_files[:preview_limit]:
                print(f"      ‚Ä¢ {os.path.relpath(candidate, base_path_questions)}")
            hidden = match_count - preview_limit
            if hidden > 0:
                print(f"      ... and {hidden} more")
            print(colored("\n   Set dry_run=False to actually remove files", "yellow"))
            return 0, versioned_files

        # Real run: delete each match, remembering the ones that fail
        undeleted = []
        for candidate in versioned_files:
            try:
                os.remove(candidate)
            except Exception as e:
                logger.error(f"Failed to remove {candidate}: {e}")
                undeleted.append(candidate)
        deleted_total = match_count - len(undeleted)

        print(colored(f"\n   ‚úì Removed {deleted_total} version history file(s)", "green"))

        if undeleted:
            print(colored(f"   ‚ö†Ô∏è Failed to remove {len(undeleted)} file(s)", "yellow"))
            for candidate in undeleted[:failure_limit]:
                print(f"      ‚Ä¢ {os.path.relpath(candidate, base_path_questions)}")

        return deleted_total, versioned_files

    except Exception as e:
        logger.error(f"Version history cleanup failed: {e}")
        print(colored(f"‚ùå Cleanup failed: {e}", "red"))
        return 0, []

# Run cleanup (dry run first to preview)
print("\n" + "="*60)
print("üßπ VERSION HISTORY CLEANUP")
print("="*60)

# Preview what would be deleted. With dry_run=True nothing is removed:
# removed_count is 0 and file_list holds the matching paths.
removed_count, file_list = cleanup_version_history(dry_run=True)

# Uncomment the line below to actually remove files
# removed_count, file_list = cleanup_version_history(dry_run=False)

2026-01-05 03:38:28,503 - INFO - Starting version history cleanup (dry_run=True)...



üßπ VERSION HISTORY CLEANUP

üìÅ Version History Cleanup:
   Files found: 0
[32m   ‚úì No version history files to clean[0m


In [5]:
# Generate Comprehensive Statistics and Summary

def generate_statistics() -> Dict:
    """
    Generate comprehensive statistics about the scoring process

    Every sub-directory under ``base_path_questions`` counts as one question.
    For each ``mark.json`` the effective mark of an entry is its
    ``overridedMark`` when one is set (including a numeric 0), otherwise its
    ``mark``.

    Returns:
        Dict with the keys total_questions, marked_questions, total_students,
        marked_students, total_marks_awarded and questions_summary. The
        summary maps each non-metadata question to its total_submissions,
        marked_submissions and total_marks. Whatever was gathered so far is
        returned if the scan fails part-way.
    """

    stats = {
        "total_questions": 0,
        "marked_questions": 0,
        "total_students": len(name_list_df) if not name_list_df.empty else 0,
        "marked_students": 0,
        "total_marks_awarded": 0,
        "questions_summary": {}
    }

    # Metadata questions that don't need submission counting
    metadata_questions = ["NAME", "ID", "CLASS"]
    # Values meaning "nothing entered"; None covers a JSON null.
    # A numeric 0 is a real mark and is deliberately not listed here.
    empty_values = ("", None)

    try:
        for path, _subdirs, files in os.walk(base_path_questions):
            # relpath stays correct when base_path_questions carries a
            # trailing separator, unlike slicing the prefix off by length.
            question = os.path.relpath(path, base_path_questions)
            if question == os.curdir:
                continue

            stats["total_questions"] += 1

            if "mark.json" not in files:
                continue

            try:
                with open(os.path.join(path, "mark.json"), "r", encoding='utf-8') as f:
                    marks = json.load(f)

                marked_count = 0
                total_marks = 0

                for mark in marks:
                    # Test for emptiness explicitly: `override or mark` would
                    # throw away an override of 0 because 0 is falsy.
                    override = mark.get('overridedMark', '')
                    if override in empty_values:
                        mark_value = mark.get('mark', '')
                    else:
                        mark_value = override

                    if mark_value in empty_values:
                        continue

                    marked_count += 1
                    try:
                        total_marks += float(mark_value)
                    except (ValueError, TypeError):
                        # Non-numeric values (e.g. names or class codes) still
                        # count as marked but add nothing to the total
                        pass

                if marked_count == len(marks):
                    stats["marked_questions"] += 1

                # Only add to summary if not a metadata question
                if question not in metadata_questions:
                    stats["questions_summary"][question] = {
                        "total_submissions": len(marks),
                        "marked_submissions": marked_count,
                        "total_marks": total_marks
                    }

                # The ID question has one entry per scanned student
                if question == "ID":
                    stats["marked_students"] = marked_count

            except Exception as e:
                # Best effort: a bad file is logged and the scan continues
                logger.warning(f"Error reading stats for {question}: {e}")

        return stats

    except Exception as e:
        logger.error(f"Failed to generate statistics: {e}")
        return stats

# Build the statistics once, then print the overall and per-question views
stats_rule = "=" * 60
print("\n" + stats_rule)
print("üìä SCORING STATISTICS")
print(stats_rule)

stats = generate_statistics()

print(f"\nüìù Overall Summary:")
print(f"   Total questions: {stats['total_questions']}")
print(f"   Fully marked questions: {stats['marked_questions']}")
print(f"   Students in name list: {stats['total_students']}")
print(f"   Students with marks: {stats['marked_students']}")

per_question = stats['questions_summary']
if per_question:
    print(f"\nüìã Per-Question Summary:")
    for question in sorted(per_question):
        summary = per_question[question]
        marked = summary['marked_submissions']
        submitted = summary['total_submissions']
        # Guard against a question with no submissions at all
        if submitted > 0:
            completion = marked / submitted * 100
        else:
            completion = 0
        print(f"   {question}:")
        print(f"      Submissions: {marked}/{submitted} ({completion:.1f}%)")
        print(f"      Total marks awarded: {summary['total_marks']:.1f}")

    print(f"\nüí° Note: NAME, ID, and CLASS are metadata fields and excluded from this summary.")


üìä SCORING STATISTICS

üìù Overall Summary:
   Total questions: 8
   Fully marked questions: 7
   Students in name list: 4
   Students with marks: 4

üìã Per-Question Summary:
   Q1:
      Submissions: 4/4 (100.0%)
      Total marks awarded: 6.0
   Q2:
      Submissions: 4/4 (100.0%)
      Total marks awarded: 31.0
   Q3:
      Submissions: 4/4 (100.0%)
      Total marks awarded: 5.0
   Q4:
      Submissions: 4/4 (100.0%)
      Total marks awarded: 10.0
   Q5:
      Submissions: 4/4 (100.0%)
      Total marks awarded: 9.0

üí° Note: NAME, ID, and CLASS are metadata fields and excluded from this summary.


In [6]:
# Final Summary and Recommendations

summary_rule = "=" * 60
print("\n" + summary_rule)
print("‚úÖ POST-SCORING CHECKS COMPLETE")
print(summary_rule)

# Metadata questions that don't need marking validation
metadata_questions = ["NAME", "ID", "CLASS"]

# Split the unfinished list into real grading gaps and metadata fields
actual_unfinished = []
metadata_flagged = []
for q in unfinished_questions:
    if q in metadata_questions:
        metadata_flagged.append(q)
    else:
        actual_unfinished.append(q)

# Every check passes only when all four result lists are empty
all_checks_passed = not (actual_unfinished or missing_ids or extra_ids or duplicate_ids)

if all_checks_passed:
    print(colored("\nüéâ All validation checks passed!", "green"))
    print("\n‚úì Ready for next steps:")
    for step in (
        "   1. Run version history cleanup (uncomment in Cell 5)",
        "   2. Backup the output directory",
        "   3. Proceed to Step 6: Scoring Postprocessing",
    ):
        print(step)
else:
    print(colored("\n‚ö†Ô∏è Some issues require attention:", "yellow"))

    if actual_unfinished:
        print(f"\n   ‚Ä¢ {len(actual_unfinished)} question(s) with incomplete marking")
        for q in actual_unfinished:
            if q in empty_marks:
                detail = f"{len(empty_marks[q])} empty mark(s)"
            else:
                detail = "mark.json missing"
            print(f"     - {q}: {detail}")
        print("     Action: Review and complete marking in the web interface")

    # The three ID findings share one layout: a count line, then an action line
    id_findings = (
        (missing_ids, "student(s) in name list but not marked",
         "Verify if these students were absent"),
        (extra_ids, "marked ID(s) not in name list",
         "Check for OCR errors or wrong class submissions"),
        (duplicate_ids, "duplicate ID(s) found",
         "Check for scanning errors or duplicate submissions"),
    )
    for found, headline, action in id_findings:
        if found:
            print(f"\n   ‚Ä¢ {len(found)} {headline}")
            print(f"     Action: {action}")

    print("\n   After resolving issues, re-run this notebook to verify.")

# Show metadata info if any were flagged
if metadata_flagged:
    print(f"\nüí° Note: {', '.join(metadata_flagged)} are metadata fields and don't require marking.")

print("\n" + summary_rule)
print(f"Session completed at: {datetime.now():%Y-%m-%d %H:%M:%S}")
print(summary_rule)


‚úÖ POST-SCORING CHECKS COMPLETE
[32m
üéâ All validation checks passed![0m

‚úì Ready for next steps:
   1. Run version history cleanup (uncomment in Cell 5)
   2. Backup the output directory
   3. Proceed to Step 6: Scoring Postprocessing

Session completed at: 2026-01-05 03:38:28


### ⚠️ DANGER ZONE: Reset Everything

**WARNING**: The code below will delete all mark.json and control.json files, resetting all manual corrections!

Only use this if you need to completely restart the marking process.

In [7]:
# ‚ö†Ô∏è DANGER: Uncomment to reset all marks and controls
# This will DELETE all mark.json and control.json files!
#
# The whole function is kept commented out on purpose. As written, it walks
# base_path_questions and removes every file named exactly "control.json" or
# "mark.json", and it does nothing unless called with confirm=True.

# def reset_all_marks(confirm: bool = False):
#     """Reset all marks - USE WITH EXTREME CAUTION"""
#     if not confirm:
#         print(colored("Safety check: Set confirm=True to actually reset", "yellow"))
#         return
#     
#     removed_count = 0
#     for path, current_directory, files in os.walk(base_path_questions):
#         for file in files:
#             if file == "control.json" or file == "mark.json":
#                 try:
#                     os.remove(os.path.join(path, file))
#                     removed_count += 1
#                 except Exception as e:
#                     logger.error(f"Failed to remove {file}: {e}")
#     
#     print(colored(f"‚ö†Ô∏è Removed {removed_count} mark/control files", "red"))
#     print("All marks have been reset. Re-run Step 4 to regenerate.")
# 
# # Uncomment and set confirm=True to actually reset
# # reset_all_marks(confirm=False)

# The only statement this cell executes: a reminder that nothing was deleted
print("‚úì Reset function available but safely commented out")

‚úì Reset function available but safely commented out
