# Step 6: Post-Scoring Packaging
Backup results, generate reports, produce scored PDFs, and collect samples for sharing. Run after completing scoring and checks.

In [1]:
from grading_utils import setup_paths, create_directories

prefix = "VTC Test"
paths = setup_paths(prefix, "sample")

# Extract commonly used paths
pdf_file = paths["pdf_file"]
name_list_file = paths["name_list_file"]
base_path = paths["base_path"]
base_path_images = paths["base_path_images"]
base_path_annotations = paths["base_path_annotations"]
base_path_questions = paths["base_path_questions"]
base_path_marked_images = paths["base_path_marked_images"]
base_path_marked_pdfs = paths["base_path_marked_pdfs"]
base_path_marked_scripts = paths["base_path_marked_scripts"]

# Create all necessary directories
create_directories(paths)

## Backup grading result
Remove version history, before you backup.

In [2]:
import os
for path, currentDirectory, files in os.walk(base_path_questions):
    for file in files:
        if file.startswith("control-") or file.startswith("mark-"):
            os.remove(os.path.join(path, file))

Zip the website.

In [3]:
import shutil
shutil.make_archive(base_path,"zip",base_path)

'/home/user/AI-Handwrite-Grader/marking_form/VTC Test.zip'

# Generate Score Report

Check the ID and Name pages to verify the values before generate the marksheet.

In [4]:
import os
import json
import pandas as pd

from grading_utils import build_student_id_mapping

# Load name list as authoritative source for student names
name_list_df = pd.read_excel(name_list_file, sheet_name="Name List")
id_col = next((col for col in name_list_df.columns if col.lower() == "id"), None)
name_col = next((col for col in name_list_df.columns if col.lower() in ["name", "student name", "student_name"]), None)
if id_col is None or name_col is None:
    raise ValueError("Name list must contain ID and NAME columns.")
name_map = (
    name_list_df.assign(**{id_col: name_list_df[id_col].astype(str)})
    .set_index(id_col)[name_col]
    .astype(str)
    .to_dict()
)

# Build student ID mapping using utility function
pageToStudentId, numberOfPage, getStudentId = build_student_id_mapping(
    base_path_questions, base_path_annotations
)

questionAndMarks = {}
for path, currentDirectory, files in os.walk(base_path_questions):
    for file in files:
        if file == "mark.json":
            question = path[len(base_path_questions) + 1 :]
            f = open(os.path.join(path, file))
            data = json.load(f)
            marks = {}
            for i in data:
                studentId = getStudentId(int(i["id"]))
                marks[studentId] = (
                    i["overridedMark"] if i["overridedMark"] != "" else i["mark"]
                )
            questionAndMarks[question] = marks
            f.close()
marksDf = pd.DataFrame(questionAndMarks)
marksDf = marksDf[
    ["ID", "NAME", "CLASS"]
    + [
        col
        for col in sorted(marksDf.columns)
        if col != "ID" and col != "NAME" and col != "CLASS"
    ]
]

# Prefer names from the uploaded name list, fallback to marked value when missing
marksDf["ID"] = marksDf["ID"].astype(str)
marksDf["NAME"] = marksDf["ID"].map(name_map).fillna(marksDf["NAME"])

marksDf["Marks"] = (
    marksDf.loc[:, ~marksDf.columns.isin(["ID", "NAME", "CLASS"])]
    .apply(pd.to_numeric)
    .sum(axis=1)
)
print(marksDf)

                  ID   NAME CLASS   Q1    Q2   Q3   Q4   Q5  Marks
234567890  234567890   John     C  1.0   1.0  4.0  0.0  4.0   10.0
123456789  123456789  Peter     A  2.0   9.0  0.0  8.0  3.0   22.0
987654321  987654321   Mary     B  1.0  10.0  0.0  0.0  0.0   11.0
345678912  345678912  Susan     D  2.0  10.0  0.0  0.0  0.0   12.0


# Create Scored Scripts

Copy raw images to marked folder

In [5]:
import shutil
import os

if os.path.exists(base_path_marked_images):
    shutil.rmtree(base_path_marked_images)
shutil.copytree(base_path_images, base_path_marked_images)

'../marking_form/VTC Test/marked/images/'

In [6]:
import json
annotations_path = base_path_annotations + "annotations.json"
with open(annotations_path, "r") as f: 
    annotations = json.load(f)          

#flatten annotations to list 
annotations_list = []
for page in annotations:
    for annotation in annotations[page]:
        annotation["page"] = int(page)
        # x to left, y to top
        annotation["left"] = annotation["x"]
        annotation["top"] = annotation["y"]
        annotation.pop("x")
        annotation.pop("y")
        annotations_list.append(annotation) 
annotations_list

# convert annotations_list to dict with key with label
annotations_dict = {}
for annotation in annotations_list:
    annotations_dict[annotation["label"]] = annotation
annotations_dict


{'Q1': {'width': 1189,
  'height': 257,
  'label': 'Q1',
  'page': 0,
  'left': 222,
  'top': 608},
 'Q2': {'width': 1189,
  'height': 292,
  'label': 'Q2',
  'page': 0,
  'left': 222,
  'top': 865},
 'Q3': {'width': 1189,
  'height': 253,
  'label': 'Q3',
  'page': 0,
  'left': 222,
  'top': 1157},
 'NAME': {'width': 171,
  'height': 52,
  'label': 'NAME',
  'page': 0,
  'left': 341,
  'top': 473},
 'ID': {'width': 216,
  'height': 46,
  'label': 'ID',
  'page': 0,
  'left': 1167,
  'top': 472},
 'CLASS': {'width': 83,
  'height': 48,
  'label': 'CLASS',
  'page': 0,
  'left': 327,
  'top': 535},
 'Q4': {'width': 1189,
  'height': 260,
  'label': 'Q4',
  'page': 1,
  'left': 223,
  'top': 547},
 'Q5': {'width': 1189,
  'height': 288,
  'label': 'Q5',
  'page': 1,
  'left': 223,
  'top': 807}}

In [7]:
studentIdToPage={}
with open(os.path.join(base_path_questions, "ID", "mark.json")) as f:
    data = json.load(f)
    for i in data:
        studentId = i["overridedMark"] if i["overridedMark"] != "" else i["mark"]
        studentIdToPage[studentId] = int(i["id"])
studentIdToPage

{'123456789': 0, '987654321': 2, '234567890': 4, '345678912': 6}

In [8]:
import cv2
from IPython.display import display
from ipywidgets import IntProgress


# Covert marksDf to dict
marksDf_list = marksDf.to_dict(orient="records")

f = IntProgress(min=0, max=len(marksDf_list))  # instantiate the bar
display(f)  # display the bar

for student in marksDf_list:
    first_page = studentIdToPage[student["ID"]]
    for annotation in annotations_dict:
        value = student[annotation]
        if annotation == "ID":
            value = value + " Marks: " + str(student["Marks"])
        x = annotations_dict[annotation]["left"]
        y = annotations_dict[annotation]["top"]
        page = first_page + annotations_dict[annotation]["page"]
      
        image_path = base_path_marked_images + str(page) + ".jpg"
        # print(value, x, y, imagePath)
        img = cv2.imread(image_path)
        textSize = cv2.getTextSize(text=str(value), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, thickness=2)
        height = textSize[0][1]
        cv2.putText(img, str(value), (x, y + height), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
        cv2.imwrite(image_path, img)
    f.value += 1      

IntProgress(value=0, max=4)

In [9]:
from PIL import Image

for student in marksDf_list:
    studentId = student["ID"]
    first_page = studentIdToPage[student["ID"]]
    last_page = first_page + numberOfPage - 1
    print(studentId, first_page, last_page)
    pdf_path = base_path_marked_pdfs + studentId + ".pdf"

    images = list(map(Image.open, [base_path_marked_images + str(i) + ".jpg" for i in range(first_page, last_page + 1)]))
    images[0].save(pdf_path, save_all=True, append_images=images[1:]) 
        

234567890 4 5
123456789 0 1
987654321 2 3
345678912 6 7


# Generate Script Sample

5 set Samples:
1. Combined scripts
2. 3 Good, 3 Average, and 3 Weak.
3. 5 Good, 5 Average, and 5 Weak.
4. 3 Good, 3 Average, and 3 Weak above the passing mark.
5. 5 Good, 5 Average, and 5 Weak above the passing mark. 

In [10]:
passingMark = 15

In [11]:
from PyPDF4 import PdfFileMerger

writer = PdfFileMerger(strict=True)

# merge all pdfs in base_path_marked_pdfs
for path, currentDirectory, files in os.walk(base_path_marked_pdfs):
    for file in files:
        if file.endswith(".pdf"):
            pdf_path = os.path.join(path, file)
            writer.append(pdf_path)
writer.write(base_path_marked_scripts + "all.pdf")           

In [12]:
from PyPDF4 import PdfFileMerger, PdfFileReader

sampling = marksDf.sort_values(by=["Marks"], ascending=False)["Marks"]

from_directory = os.path.join(os.getcwd(), "..", "templates", "pdf")

goodPage = PdfFileReader(from_directory + "/Good.pdf")
averagePage = PdfFileReader(from_directory + "/Average.pdf")
weakPage = PdfFileReader(from_directory + "/Weak.pdf")


def get_scripts_psf(df):
    return list(map(lambda rowNumber: base_path_marked_pdfs + rowNumber + ".pdf", df.index))


def take_sample(n, sampling, suffix=""):
    if len(sampling) < 3 * n:
        n = int(len(sampling) / 3)
    good = sampling.head(n)
    weak = sampling.tail(n)
    median = int(len(sampling) / 2)
    take = int(n / 2)
    average = sampling.iloc[median - take : median + take]

    merger = PdfFileMerger()
    merger.append(goodPage)
    for pdf in get_scripts_psf(good):
        merger.append(PdfFileReader(pdf))
    merger.append(averagePage)
    for pdf in get_scripts_psf(average):
        merger.append(PdfFileReader(pdf))
    merger.append(weakPage)
    for pdf in get_scripts_psf(weak):
        merger.append(PdfFileReader(pdf))
    fileName = base_path_marked_scripts + "sampleOf" + str(n) + suffix + ".pdf"
    merger.write(open(fileName, "wb"))
    print("Output successfully written to" + fileName)
    merger.close()


take_sample(3, sampling)
take_sample(5, sampling)

sampling = sampling.where(lambda x: x > passingMark)
take_sample(3, sampling, "_only_pass")
take_sample(5, sampling, "_only_pass")

Output successfully written to../marking_form/VTC Test/marked/scripts/sampleOf1.pdf
Output successfully written to../marking_form/VTC Test/marked/scripts/sampleOf1.pdf
Output successfully written to../marking_form/VTC Test/marked/scripts/sampleOf1_only_pass.pdf
Output successfully written to../marking_form/VTC Test/marked/scripts/sampleOf1_only_pass.pdf


In [13]:
import pandas as pd
import re


def clean_answer_text(val: str) -> str:
    """Strip leading numbering and drop standalone question labels like Q2."""
    if not isinstance(val, str):
        return val
    lines = [ln.strip() for ln in str(val).splitlines()]
    cleaned = []
    for ln in lines:
        ln = re.sub(r"^\s*\d+\s*[\.|\)]\s*", "", ln)
        if re.fullmatch(r"q\d+", ln, flags=re.IGNORECASE):
            continue
        if ln:
            cleaned.append(ln)
    return "\n".join(cleaned).strip()


def collect_answers_and_reasoning():
    """Gather per-student answers and model reasoning from each question's CSV and pivot to a mark-style wide format."""
    answer_rows = []
    reasoning_rows = []

    for path, currentDirectory, files in os.walk(base_path_questions):
        if "data.csv" not in files:
            continue

        question = path[len(base_path_questions) + 1 :]
        data_path = os.path.join(path, "data.csv")
        df = pd.read_csv(data_path)
        if "page" not in df.columns:
            continue

        # Map scanned page back to student ID using the existing helper
        df["StudentID"] = df["page"].apply(
            lambda p: getStudentId(int(str(p).split(".")[0])) if pd.notna(p) else None
        )

        for _, row in df.iterrows():
            sid = row.get("StudentID")
            if sid is None:
                continue

            raw_answer = row.get("Answer", "")
            answer_val = clean_answer_text(raw_answer)
            source_page = row.get("page", "")
            row_number = row.get("RowNumber", "")

            answer_rows.append(
                {
                    "ID": str(sid),
                    "Question": question,
                    "Answer": answer_val,
                    "SourcePage": source_page,
                    "RowNumber": row_number,
                }
            )

            reasoning_rows.append(
                {
                    "ID": str(sid),
                    "Question": question,
                    "Reasoning": row.get("Reasoning", ""),
                    "Similarity": row.get("Similarity", ""),
                    "ModelMark": row.get("Mark", ""),
                    "Answer": answer_val,
                    "SourcePage": source_page,
                    "RowNumber": row_number,
                }
            )

    answers_df = pd.DataFrame(answer_rows)
    reasoning_df = pd.DataFrame(reasoning_rows)

    if not answers_df.empty:
        answers_df = answers_df.sort_values(
            by=["ID", "Question", "SourcePage", "RowNumber"]
        ).reset_index(drop=True)
    if not reasoning_df.empty:
        reasoning_df = reasoning_df.sort_values(
            by=["ID", "Question", "SourcePage", "RowNumber"]
        ).reset_index(drop=True)

    # Preserve ID/NAME/CLASS to match marks layout
    meta_cols = ["ID", "NAME", "CLASS"]
    student_meta = marksDf[meta_cols].drop_duplicates().set_index("ID")

    # Keep question ordering aligned with marks sheet
    question_cols = [
        col
        for col in marksDf.columns
        if col not in ["ID", "NAME", "CLASS", "Marks"]
    ]

    # Wide answers: one row per student, one column per question
    answers_wide = student_meta.copy()
    if not answers_df.empty:
        answers_pivot = answers_df.pivot_table(
            index="ID", columns="Question", values="Answer", aggfunc="first"
        )
        answers_pivot = answers_pivot.reindex(columns=question_cols)
        answers_wide = answers_wide.join(answers_pivot)
        answers_wide = answers_wide.reset_index()

    # Wide reasoning: only the reasoning text per question (matches marks layout)
    reasoning_wide = student_meta.copy()
    if not reasoning_df.empty:
        reasoning_pivot = reasoning_df.pivot_table(
            index="ID", columns="Question", values="Reasoning", aggfunc="first"
        )
        reasoning_pivot = reasoning_pivot.reindex(columns=question_cols)
        reasoning_wide = reasoning_wide.join(reasoning_pivot)
        reasoning_wide = reasoning_wide.reset_index()

    return answers_wide, reasoning_wide, answers_df, reasoning_df


answers_sheet, reasoning_sheet, answers_raw, reasoning_raw = collect_answers_and_reasoning()

In [14]:
details_report_path = base_path_marked_scripts + "details_score_report.xlsx"

# Multi-sheet Excel: marks, answers (wide), reasoning (wide) + raw long-form for audit
with pd.ExcelWriter(details_report_path) as writer:
    marksDf.to_excel(writer, sheet_name="Marks", index=False)
    answers_sheet.to_excel(writer, sheet_name="Answers", index=False)
    reasoning_sheet.to_excel(writer, sheet_name="Reasoning", index=False)
    answers_raw.to_excel(writer, sheet_name="AnswersRaw", index=False)
    reasoning_raw.to_excel(writer, sheet_name="ReasoningRaw", index=False)

# Lightweight summary sheet remains unchanged
marksDf[["ID", "NAME", "CLASS", "Marks"]].to_excel(
    base_path_marked_scripts + "score_report.xlsx", index=False
)

print("Saved:", details_report_path)

Saved: ../marking_form/VTC Test/marked/scripts/details_score_report.xlsx


## Gemini Performance Report
Use Gemini to generate a concise per-student performance report (question text, marking scheme, awarded marks, captured answer) and store it as a new `Performance` sheet inside `details_score_report.xlsx` (alongside Marks/Answers/Reasoning).

In [15]:
import os
import json
import hashlib
from grading_utils import (
    init_gemini_client,
    get_cache_key,
    get_from_cache,
    save_to_cache,
    create_gemini_config,
)

# Initialize Gemini client
client = init_gemini_client()

marking_scheme_file = paths["marking_scheme_file"]
marking_scheme_df = pd.read_excel(marking_scheme_file, sheet_name="Marking Scheme")
required_cols = {"question_number", "question_text", "marking_scheme", "marks"}
missing_cols = required_cols - set(marking_scheme_df.columns)
if missing_cols:
    raise ValueError(f"Marking scheme is missing columns: {missing_cols}")

def normalize_question_label(val):
    label = str(val).strip()
    if label.upper().startswith("Q"):
        return label
    try:
        as_float = float(label)
        if as_float.is_integer():
            return f"Q{int(as_float)}"
    except Exception:
        pass
    return f"Q{label}"

marking_scheme_df["Question"] = marking_scheme_df["question_number"].apply(normalize_question_label)
question_meta = (
    marking_scheme_df
    .set_index("Question")[
        ["question_text", "marking_scheme", "marks"]
    ]
    .to_dict(orient="index")
)

question_cols = [col for col in marksDf.columns if col not in ["ID", "NAME", "CLASS", "Marks"]]
answers_lookup = {}
if not answers_sheet.empty:
    for _, row in answers_sheet.iterrows():
        answers_lookup[str(row["ID"])] = {
            col: ("" if pd.isna(row.get(col, "")) else str(row.get(col, "")))
            for col in question_cols
        }

cache_dir = paths["cache_dir"]

# Create Gemini config using utility function
config = create_gemini_config(temperature=0.35, top_p=0.9, max_output_tokens=1536)

def build_question_section(student_row, student_answers):
    blocks = []
    for q, meta in sorted(question_meta.items()):
        awarded_mark = student_row.get(q, "")
        if pd.isna(awarded_mark):
            awarded_mark = ""
        answer_text = student_answers.get(q, "")
        blocks.append(
            f"Question {q} (max {meta.get('marks', '')}):\n"
            f"Prompt: {meta.get('question_text', '')}\n"
            f"Marking scheme: {meta.get('marking_scheme', '')}\n"
            f"Awarded mark: {awarded_mark}\n"
            f"Student answer:\n{answer_text or '[no answer captured]'}"
        )
    return "\n\n---\n\n".join(blocks)

def generate_student_report(student_row):
    sid = str(student_row.get("ID", ""))
    student_answers = answers_lookup.get(sid, {})
    question_section = build_question_section(student_row, student_answers)
    prompt = f"""You are an instructor drafting a concise performance report.

Student: {sid} - {student_row.get('NAME', '')} (Class: {student_row.get('CLASS', '')})
Total score: {student_row.get('Marks', '')}

Use the question details, marking schemes, awarded marks, and answers below:
{question_section}

Write:
- 2-3 sentence overall summary of strengths and weaknesses.
- One short bullet per question with actionable feedback tied to the marking scheme.
- 2 concrete next-step study suggestions focused on the weakest skills.
Keep it under 220 words and avoid restating the input verbatim."""
    payload_hash = hashlib.sha256(question_section.encode("utf-8")).hexdigest()
    cache_key = get_cache_key(
        "performance_report",
        model="gemini-3-flash-preview",
        student_id=sid,
        payload_hash=payload_hash,
    )
    cached = get_from_cache(cache_key, cache_dir)
    if cached is not None:
        return cached.get("report", "")
    response = client.models.generate_content(
        model="gemini-3-flash-preview",
        contents=[{"role": "user", "parts": [{"text": prompt}]}],
        config=config,
    )
    report_text = response.text if response.text else ""
    save_to_cache(cache_key, {"report": report_text}, cache_dir)
    return report_text

performance_rows = []
for _, row in marksDf.iterrows():
    report_text = generate_student_report(row)
    performance_rows.append(
        {
            "ID": row.get("ID", ""),
            "NAME": row.get("NAME", ""),
            "CLASS": row.get("CLASS", ""),
            "Marks": row.get("Marks", ""),
            "PerformanceReport": report_text,
        }
    )

performance_df = pd.DataFrame(performance_rows)

# Append/replace Performance sheet inside the existing details_score_report.xlsx
writer_kwargs = {"sheet_name": "Performance", "index": False}
if os.path.exists(details_report_path):
    with pd.ExcelWriter(
        details_report_path,
        mode="a",
        engine="openpyxl",
        if_sheet_exists="replace",
    ) as writer:
        performance_df.to_excel(writer, **writer_kwargs)
else:
    performance_df.to_excel(details_report_path, **writer_kwargs)

print("Saved Performance sheet in:", details_report_path)
performance_df.head()

âœ“ Vertex AI Express Mode initialized
Saved Performance sheet in: ../marking_form/VTC Test/marked/scripts/details_score_report.xlsx


Unnamed: 0,ID,NAME,CLASS,Marks,PerformanceReport
0,234567890,John,C,10.0,**Performance Report: John (Class C)**\n\n**Ov...
1,123456789,Peter,A,22.0,**Performance Report**\n\n**Student:** Peter (...
2,987654321,Mary,B,11.0,**Performance Report: Mary (987654321)**\n\n**...
3,345678912,Susan,D,12.0,**Performance Report: Susan (345678912)**\n\nS...


In [16]:
from IPython.display import FileLink 

# zip base_path_marked_scripts folder
script_zip = base_path_marked_scripts + "../scripts"
shutil.make_archive(script_zip, "zip", base_path_marked_scripts)
FileLink(script_zip + ".zip")