In [None]:
# Excel workbook with the students' scripts; it is read from the "data" folder below.
studentsScriptFileName = "Actual Test Scripts (after override).xlsx"
# Excel workbook with the standard answers; it is read from the "data" folder below.
standAnswerFileName = "Actual Test Solution_override.xlsx"

# Prerequisite
Set up a new Conda environment
1. New terminal.
2. Run ```cd sagemaker-studiolab-notebooks/ ```
3. Run ```conda env create -f env_basic.yaml ```
4. Refresh your Jupyter notebook webpage.
5. Select the kernel "basic".

To add new packages
1. Update "env_basic.yaml".
2. Run ```conda env update -f env_basic.yaml ```

Add AWS Academy Learner Lab and remember to renew it every 4 hours!
1. Rename AWSAcademyLeanerLab-template.config to AWSAcademyLeanerLab.config.
2. Copy AWS CLI credentials from AWS Academy Learner Lab.
3. Paste it in AWSAcademyLeanerLab.config.

In [None]:
import configparser

# canCallAWS stays False unless valid credentials are found AND accepted by AWS;
# later cells use it to decide whether Amazon Rekognition may be called.
canCallAWS = False

config = configparser.ConfigParser()
# ConfigParser.read() silently skips a missing file, so every lookup below uses a
# fallback instead of indexing (which would raise KeyError on a fresh checkout).
config.read("AWSAcademyLeanerLab.config")
awsAccessKeyId = config.get("default", "aws_access_key_id", fallback="")
awsSecretAccessKeyId = config.get("default", "aws_secret_access_key", fallback="")
awsSessionToken = config.get("default", "aws_session_token", fallback="")
if awsAccessKeyId == "" or awsSecretAccessKeyId == "" or awsSessionToken == "":
    print("Missing AWSAcademyLeanerLab credentials")
else:
    import boto3
    from botocore.exceptions import BotoCoreError, ClientError

    boto3.setup_default_session(
        aws_access_key_id=awsAccessKeyId,
        aws_secret_access_key=awsSecretAccessKeyId,
        aws_session_token=awsSessionToken,
        region_name="us-east-1",
    )
    try:
        # Learner Lab credentials expire; verify them before enabling AWS calls.
        awsAccount = boto3.client("sts").get_caller_identity().get("Account")
    except (BotoCoreError, ClientError) as error:
        print("AWSAcademyLeanerLab credentials were rejected: " + str(error))
    else:
        print("Your AWS Account Number: " + awsAccount)
        canCallAWS = True

In [None]:
import numpy as np
import pandas as pd
import json
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as spc
from scipy import stats
import os

In [None]:
# sheet_name=None makes read_excel return a dict of DataFrames keyed by sheet name,
# so df_script / df_answer are indexed by sheet name in the rest of the notebook.
df_script = pd.read_excel(
    os.path.join(os.getcwd(), "data", studentsScriptFileName), sheet_name=None
)
df_answer = pd.read_excel(
    os.path.join(os.getcwd(), "data", standAnswerFileName), sheet_name=None
)

In [None]:
# Jupyter renders only the last expression of a cell, so this cell shows the column
# index of the standard-answer sheet; the head() result is computed but not shown.
df_answer["DocumentValue"].head()
df_answer["DocumentValue"].columns

Columns that appear in the standard answer but not in the student scripts. Amazon Textract did not return them, probably because no student answered those questions.

In [None]:
# Column names present in the standard-answer sheet but absent from the student sheet.
df_answer["DocumentValue"].columns.difference(df_script["PageAnswerGeometry"].columns)

Columns that appear in the student scripts but not in the standard answer. If any are listed, you need to update the standard answer Excel file.

In [None]:
# Column names present in the student sheet but absent from the standard-answer sheet.
df_script["PageAnswerGeometry"].columns.difference(df_answer["DocumentValue"].columns)

Common columns

In [None]:
# Questions shared by the student sheet and the standard-answer sheet, as a plain list
# (a list is needed because the next cell inserts items into it).
columns = (
    df_script["PageAnswerGeometry"]
    .columns.intersection(df_answer["DocumentValue"].columns)
    .to_list()
)
columns

Prepend the student ID and Name columns

In [None]:
# Put "ID" and "Name" first. Rebuilding the list (instead of list.insert) keeps the
# cell idempotent: re-running it no longer adds a second "ID"/"Name" pair, which
# would produce duplicated DataFrame columns in the following cells.
columns = ["ID", "Name"] + [
    column for column in columns if column not in ("ID", "Name")
]
columns

Remove the columns that do not appear in the standard answer

In [None]:
# Keep only the selected columns (ID, Name and the common questions) in each student sheet.
# NOTE(review): the loop iterates the answer workbook's sheet names but indexes
# df_script, so a sheet that exists only in the answer workbook raises KeyError —
# confirm both workbooks always share the same sheet names.
for key in df_answer.keys():
    df_script[key] = df_script[key][columns]
df_answer.keys()

# Preprocessing bounding boxes with margin

In [None]:
import os
from urllib.parse import urlparse
import urllib.request
from pathlib import Path
import urllib.request


# Make sure the local image folder exists before anything is downloaded into it.
path = Path(os.getcwd()) / "output" / "grading_form" / "images"
path.mkdir(parents=True, exist_ok=True)

# images maps each remote image URL to the relative path of its local copy.
images = {}
for rowIndex, row in df_script["DocumentAnswerImage"].iterrows():
    for columnIndex, value in row.items():
        # Skip empty cells and URLs that were already handled.
        if pd.isna(value) or value in images:
            continue
        fileName = os.path.join(
            "output", "grading_form", "images", os.path.basename(urlparse(value).path)
        )
        # Download only when there is no local copy yet.
        if not os.path.isfile(fileName):
            urllib.request.urlretrieve(value, fileName)
        images[value] = fileName

In [None]:
from PIL import Image

# Folder holding the downloaded page images, named "p-<page>.png".
dir_path = Path(os.path.join(os.getcwd(), "output", "grading_form", "images"))


def mapper(x):
    """Convert one geometry JSON cell into a pixel bounding box with a margin.

    Parameters
    ----------
    x : str or missing value
        JSON text expected to contain "BoundingBox" (with "Left", "Top",
        "Width", "Height") and "page".

    Returns
    -------
    dict or None
        ``{"left", "top", "height", "width", "page"}`` in pixels of the page
        image, or None when the cell is missing or has no usable bounding box.
    """
    if pd.isna(x) or x is None:
        return None
    j = json.loads(x)
    if j is None:
        return None
    if "BoundingBox" not in j:
        return None

    if "page" in j:
        boundingBox = j["BoundingBox"]
        path = Path(os.path.join(dir_path, "p-" + str(j["page"]) + ".png"))
        # Only the page size is needed, so the image is closed right away.
        with Image.open(path) as im:
            pageWidth, pageHeight = im.size

        # NOTE(review): boundingBox["Height"] is multiplied by pageHeight below, so
        # it looks like a fraction of the page; a margin derived from it without
        # that scaling is sub-pixel. Confirm whether
        # pageHeight * boundingBox["Height"] * 0.05 was intended.
        margin = boundingBox["Height"] * 0.05
        # Extra 50 px on the left of the box.
        leftMargin = margin + 50
        left = max(pageWidth * boundingBox["Left"] - leftMargin, 0)
        top = max(pageHeight * boundingBox["Top"] - margin, 0)
        width = min(
            pageWidth * boundingBox["Width"] + leftMargin + margin,
            pageWidth,
        )
        height = min(pageHeight * boundingBox["Height"] + 2 * margin, pageHeight)
        return {
            "left": left,
            "top": top,
            "height": height,
            "width": width,
            "page": j["page"],
        }

    # No "page" entry: try to decode the value once more and take its bounding box.
    try:
        return json.loads(j)["BoundingBox"]
    except (TypeError, ValueError, KeyError):
        # Report the unexpected payload type and treat the cell as missing.
        print(type(j))
        return None


# Apply mapper to every cell of the geometry sheets and store the result as a new
# "DocumentBoundingBox" entry in each workbook dict.
df_script["DocumentBoundingBox"] = df_script["DocumentAnswerGeometry"].applymap(mapper)
df_answer["DocumentBoundingBox"] = df_answer["DocumentAnswerGeometry"].applymap(mapper)

# Preparation for the imputation of the missing page

In [None]:
def mapper(x):
    """Return 0 when the cell is missing (NaN/None) and 1 otherwise.

    Note: this redefines the ``mapper`` name used by the bounding-box cell above.
    """
    is_missing = pd.isna(x) or x is None
    return 0 if is_missing else 1


# 0/1 presence matrix: 1 where a geometry value exists for that student and question.
df_script["PageAnswer"] = df_script["PageAnswerGeometry"].applymap(mapper)
df_script["PageAnswer"].head()

Create correlation matrix and apply clustering.

In [None]:
import scipy
import scipy.cluster.hierarchy as sch


def cluster_corr(corr_array, inplace=False):
    """
    Reorder a correlation matrix so that strongly correlated variables sit
    next to each other.

    Rows are clustered hierarchically (complete linkage on their pairwise
    distances), the tree is cut at half of the largest pairwise distance, and
    rows/columns are sorted by the resulting cluster label.

    Parameters
    ----------
    corr_array : pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix
    inplace : bool, optional
        when False (default) the input is copied before it is reordered

    Returns
    -------
    pandas.DataFrame or numpy.ndarray
        a NxN correlation matrix with the columns and rows rearranged
    """
    distances = sch.distance.pdist(corr_array)
    tree = sch.linkage(distances, method="complete")
    cluster_labels = sch.fcluster(tree, distances.max() / 2, criterion="distance")
    order = np.argsort(cluster_labels)

    matrix = corr_array if inplace else corr_array.copy()

    if not isinstance(matrix, pd.DataFrame):
        return matrix[order, :][:, order]
    # Reorder the rows, transpose, then reorder the rows again (= the columns).
    return matrix.iloc[order, :].T.iloc[order, :]


# Pearson correlation between the 0/1 presence columns of the questions.
corr = df_script["PageAnswer"].corr(method="pearson")
# From here on, corr is the reordered (clustered) matrix, not the raw one.
corr = cluster_corr(corr)
fig, ax = plt.subplots(figsize=(20, 20))
ax = sns.heatmap(
    corr,
    vmin=-1,
    vmax=1,
    center=0,
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
)
# The trailing ";" suppresses the textual repr of the tick labels in the cell output.
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, horizontalalignment="right");

A page contains a set of questions, and that set forms a group.

In [None]:
# Same clustering recipe as cluster_corr (complete linkage, cut at half of the
# largest distance), applied to the already reordered corr matrix so that every
# question receives a group label.
pdist = spc.distance.pdist(corr)
linkage = spc.linkage(pdist, method="complete")
idx = spc.fcluster(linkage, 0.5 * pdist.max(), "distance")

# One row per question with the cluster ("group") it belongs to.
grouping = pd.DataFrame(data={"question": corr.columns, "group": idx})
grouping

In [None]:
# One row per group, with the list of its questions in the "questions" column.
question_group = (
    grouping.groupby("group")["question"].apply(list).reset_index(name="questions")
)
question_group

Create an ignore flag when a group's average correlation is less than 0.5. Amazon Textract may capture some noise questions that end up forming their own group, and the correlation within such a noise group is very low.

In [None]:
def get_subgroup_average(questions, corr):
    """Return the average correlation among ``questions``.

    The sub-matrix of ``corr`` restricted to the given questions (rows and
    columns) is averaged per column, and those column means are averaged again.
    """
    in_group = corr.index.isin(questions)
    block = corr.loc[in_group, questions]
    column_means = block.mean()
    return column_means.mean()


# Average within-group correlation for every group.
question_group["mean"] = question_group["questions"].apply(
    lambda x: get_subgroup_average(x, corr)
)
# Groups whose average correlation is below 0.5 are flagged to be ignored.
question_group["ignore"] = question_group["mean"] < 0.5
question_group

In [None]:
# Per-question lookup table: group id, the list of questions in the same group
# ("question_set"), the group mean and the ignore flag, indexed by question name.
question_mapping = pd.merge(grouping, question_group, how="inner", on="group").rename(
    columns={"questions": "question_set"}
)
question_mapping = question_mapping.set_index("question")
question_mapping

By checking question_set, we can impute the missing page number by checking the page in the same set.

# Preparation for the imputation of the missing bounding box

Exclude the ignore columns and calculates the ratio of missing value for each question.

In [None]:
# NOTE: this rebinds the global "columns" (previously a list of column names) to the
# index of the questions that are not flagged as ignored.
columns = question_mapping[question_mapping["ignore"] == False].index
df_script["DocumentBoundingBoxFiltered"] = df_script["DocumentBoundingBox"][columns]
# Percentage of missing bounding boxes per question.
df_script["DocumentBoundingBoxFiltered"].isna().mean().round(4) * 100

In [None]:
def get_trim_mean(box, field):
    """Return the 5%-trimmed mean of ``field`` over a sequence of box dicts."""
    values = [item[field] for item in box]
    return stats.trim_mean(values, 0.05)


def trimed_mean_bound_box(boxes):
    """Return a box whose height, left, top and width are the trimmed means of
    the corresponding fields over all non-None boxes in ``boxes``."""
    present = [candidate for candidate in boxes if candidate is not None]
    return {
        field: get_trim_mean(present, field)
        for field in ("height", "left", "top", "width")
    }


# Aggregate every (non-ignored) question column into one estimated bounding box.
trimedMeanBoundBoxes = df_script["DocumentBoundingBoxFiltered"].agg(
    trimed_mean_bound_box, axis=0
)
# One row per question, with the estimated box in the "boundingBox" column.
frame = {"boundingBox": trimedMeanBoundBoxes}
estimatedBoundBoxes = pd.DataFrame(frame)
estimatedBoundBoxes.head()

# Imputation for the missing Bounding box

In [None]:
# Work on a copy so that the original bounding boxes (with their gaps) stay available.
df_script["DocumentBoundingBoxImpute"] = df_script["DocumentBoundingBox"].copy()

In [None]:
from statistics import mode


def impute_page_and_boundbox(df, row_number, row):
    """Fill the missing bounding boxes of one student row in ``df``.

    For every cell of ``row`` that is None, the estimated box of that question is
    written to ``df`` at ``(row_number, question)``, together with the most common
    page among the other questions of the same question set in this row.

    A cell is left untouched when no estimate exists for the question (for example
    a question excluded from the estimates) or when no page can be inferred
    because every question of the set is missing in this row.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that receives the imputed boxes (modified in place).
    row_number : index label of the row in ``df``.
    row : pandas.Series
        The row of original bounding boxes, keyed by question name.
    """
    for key, cell in row.items():
        if cell is not None:
            continue
        # Questions without an estimated box cannot be imputed.
        if key not in estimatedBoundBoxes.index or key not in question_mapping.index:
            continue
        # .iloc[0] takes the first match explicitly instead of relying on the
        # positional fallback of Series[0] on a label index.
        estimatedBoundBox = estimatedBoundBoxes.loc[[key], "boundingBox"].iloc[0]
        questionSet = question_mapping.loc[[key], "question_set"].iloc[0]
        pages = [row[q]["page"] for q in questionSet if row[q] is not None]
        if not pages:
            # mode() raises on empty data; nothing in this row tells us the page.
            continue
        df.at[row_number, key] = {
            "left": estimatedBoundBox["left"],
            "top": estimatedBoundBox["top"],
            "height": estimatedBoundBox["height"],
            "width": estimatedBoundBox["width"],
            "page": mode(pages),
        }


# Read the gaps from the original table and write the imputed boxes into the copy.
for i, j in df_script["DocumentBoundingBox"].iterrows():
    impute_page_and_boundbox(df_script["DocumentBoundingBoxImpute"], i, j)
df_script["DocumentBoundingBoxImpute"].head()

# Imputation Image path

In [None]:
# Translate every image URL into its local file path through the "images" dict built
# by the download cell; na_action="ignore" leaves missing cells untouched.
df_script["DocumentAnswerImageLocal"] = df_script["DocumentAnswerImage"]
df_script["DocumentAnswerImageLocal"] = df_script["DocumentAnswerImageLocal"].applymap(
    lambda x: images[x], na_action="ignore"
)

df_script["DocumentAnswerImageLocal"].head()

In [None]:
import math

path = Path(os.path.join(os.getcwd(), "output", "grading_form", "images"))


def impute_image_path(df, row_number, row):
    """Fill the missing page-image paths of one student row in ``df``.

    For every cell of ``row`` (the original bounding boxes) that is None, the page
    is inferred as the most common page among the other questions of the same
    question set in this row, and the matching "p-<page>.png" path is written to
    ``df`` at ``(row_number, question)``.

    A cell is left untouched when the question has no question set or when every
    question of the set is missing in this row, so no page can be inferred.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of local image paths (modified in place).
    row_number : index label of the row in ``df``.
    row : pandas.Series
        The row of original bounding boxes, keyed by question name.
    """
    for key, cell in row.items():
        if cell is not None:
            continue
        if key not in question_mapping.index:
            continue
        # .iloc[0] takes the first match explicitly instead of relying on the
        # positional fallback of Series[0] on a label index.
        questionSet = question_mapping.loc[[key], "question_set"].iloc[0]
        pages = [row[q]["page"] for q in questionSet if row[q] is not None]
        if not pages:
            # mode() raises on empty data; nothing in this row tells us the page.
            continue
        df.at[row_number, key] = os.path.join(
            "output", "grading_form", "images", "p-" + str(mode(pages)) + ".png"
        )


# The gaps are detected on the original bounding boxes; the paths are written into
# the local image table.
for i, j in df_script["DocumentBoundingBox"].iterrows():
    impute_image_path(df_script["DocumentAnswerImageLocal"], i, j)
df_script["DocumentAnswerImageLocal"].head()

# Preprocessing Choice style questions

In [None]:
# Choice-style questions are stored as one column per option, named
# "<question>-<option>"; keep only those columns.
estimatedBoundBoxesOptions = estimatedBoundBoxes[
    estimatedBoundBoxes.index.str.endswith("-Yes")
    | estimatedBoundBoxes.index.str.endswith("-No")
    | estimatedBoundBoxes.index.str.endswith("-A")
    | estimatedBoundBoxes.index.str.endswith("-B")
    | estimatedBoundBoxes.index.str.endswith("-C")
    | estimatedBoundBoxes.index.str.endswith("-D")
    | estimatedBoundBoxes.index.str.endswith("-E")
].copy()
estimatedBoundBoxesOptions["question"] = estimatedBoundBoxesOptions.index
# Split at the first "-" only. "n" is passed by keyword because newer pandas
# versions no longer accept it positionally.
estimatedBoundBoxesOptions[["question", "option"]] = estimatedBoundBoxesOptions[
    "question"
].str.split("-", n=1, expand=True)
estimatedBoundBoxesOptions.head()

In [None]:
from shapely import geometry
from functools import reduce


def to_polygon(boundingbox):
    """Return the rectangle described by ``boundingbox`` as a shapely Polygon.

    ``boundingbox`` is a dict with "left", "top", "width" and "height". The
    corners are listed around the perimeter (top-left, top-right, bottom-right,
    bottom-left) so the ring does not cross itself; listing them as top-left,
    top-right, bottom-left, bottom-right produces a self-intersecting "bow-tie".
    """
    left = boundingbox["left"]
    top = boundingbox["top"]
    right = left + boundingbox["width"]
    bottom = top + boundingbox["height"]
    return geometry.Polygon(
        [(left, top), (right, top), (right, bottom), (left, bottom)]
    )


def to_combined_boundingbox(boundingboxes):
    """Return the smallest box that encloses every box in ``boundingboxes``.

    The boxes are converted to polygons, merged with ``union``, and the bounds
    of the merged shape are turned back into a left/top/width/height dict.
    """
    polygons = (to_polygon(box) for box in boundingboxes)
    merged = reduce(lambda acc, polygon: acc.union(polygon), polygons)
    bounds = merged.bounds
    x_min, y_min, x_max, y_max = bounds[0], bounds[1], bounds[2], bounds[3]
    return {
        "left": x_min,
        "top": y_min,
        "width": x_max - x_min,
        "height": y_max - y_min,
    }


questionOptions = estimatedBoundBoxesOptions["question"].to_list()
# Per question: a dict of that question's option rows, keyed by column name.
groupedOptions = estimatedBoundBoxesOptions.groupby(["question"]).apply(dict).to_dict()

# Per question: one box enclosing the estimated boxes of all of its options.
combined_boundingbox = dict(
    map(
        lambda q: (q, to_combined_boundingbox(groupedOptions[q]["boundingBox"])),
        groupedOptions,
    )
)

In [None]:
import math

# One row per choice question with its combined bounding box.
estimatedOptions = pd.DataFrame(
    combined_boundingbox.items(), columns=["question", "boundingBox"]
).set_index("question")
# The first row of the standard-answer sheet, as {column name: answer}.
questionToAnswer = df_answer["DocumentValue"].iloc[0].to_dict()
for key, value in questionToAnswer.items():
    # Anything that is not a string (e.g. NaN for an empty cell) becomes "".
    # NOTE(review): a numeric standard answer would also be blanked here — confirm
    # that all standard answers are stored as text.
    questionToAnswer[key] = value if type(value) is str else ""

# Option suffixes recognised for choice-style questions.
choices = ["Yes", "No", "A", "B", "C", "D", "E"]


def get_answer(row):
    """Return the set of options marked "X" in the standard answer for the
    question named by ``row.name``."""
    prefix = row.name + "-"
    correct = set()
    for option in choices:
        column = prefix + option
        if column in questionToAnswer and questionToAnswer[column] == "X":
            # Empty option names are never reported.
            if option != "":
                correct.add(option)
    return correct


def get_choices(row):
    """Return the "<question>-<option>" column names that exist in the standard
    answer for the question named by ``row.name``, in the order of ``choices``."""
    prefix = row.name + "-"
    candidates = (prefix + option for option in choices)
    return [
        column
        for column in candidates
        if column in questionToAnswer and column != ""
    ]


# axis=1 passes each row to the helpers; row.name is the question name (the index).
estimatedOptions["answers"] = estimatedOptions.apply(get_answer, axis=1)
estimatedOptions["choices"] = estimatedOptions.apply(get_choices, axis=1)
estimatedOptions

In [None]:
# Display the standard answers for a visual check.
questionToAnswer

# Generate Marking form

Check for regeneration

In [None]:
import os

# Collect the control.json of every question folder that asks for regeneration.
questionDir = os.path.join(os.getcwd(), "output", "grading_form", "questions")
questionAndControl = {}
for path, currentDirectory, files in os.walk(questionDir):
    for file in files:
        if file == "control.json":
            # The question name is the folder path relative to questionDir.
            question = path[len(questionDir) + 1 :]
            # The context manager closes the file even when json.load raises.
            with open(os.path.join(path, file)) as f:
                data = json.load(f)
            if "regenerate" in data:
                questionAndControl[question] = data

questionAndControl

Crop the answer images and run OCR again.

EasyOCR supports different languages; see https://github.com/JaidedAI/EasyOCR

This step takes time!

In [None]:
from PIL import Image, ImageEnhance
from IPython.display import display
import easyocr
import tempfile
import boto3

easyocrLanguages = ["en"]
# Creating the reader loads the OCR model, so it is done once and reused.
reader = easyocr.Reader(easyocrLanguages, gpu=True)
# Amazon Rekognition is only used when the credential cell succeeded; creating the
# client without a configured session/region fails, so skip it otherwise.
rekognition = boto3.client('rekognition') if canCallAWS else None
# Minimum word confidence passed to the Rekognition word filter.
minConfidence = 70

# Start from a copy of the original values; regenerated questions overwrite their column.
df_script["DocumentValueOverrided"] = df_script[
    "DocumentValue"
].copy()

def get_rekognition_detect_text(imagePath):
    """Send the image file to Amazon Rekognition DetectText and return the text of
    all detections of type LINE, joined with single spaces."""
    with open(imagePath, 'rb') as image:
        response = rekognition.detect_text(
            Image={'Bytes': image.read()},
            Filters={'WordFilter': {'MinConfidence': minConfidence}},
        )
        detectedLines = [
            detection['DetectedText']
            for detection in response['TextDetections']
            if detection['Type'] == 'LINE'
        ]
        return " ".join(detectedLines)
    
def ocr_question_image(image,overrdiedBoundingBox):
    """Crop the bounding box out of a page image, sharpen it and OCR the result.

    The crop is saved as "temp.png" and read with EasyOCR. When EasyOCR finds
    text and AWS is available, Amazon Rekognition is asked as well and its text
    is preferred unless it is empty.
    """
    with Image.open(image) as page:
        # PIL's crop takes (left, upper, right, lower).
        box = (
            overrdiedBoundingBox["left"],
            overrdiedBoundingBox["top"],
            overrdiedBoundingBox["left"] + overrdiedBoundingBox["width"],
            overrdiedBoundingBox["top"] + overrdiedBoundingBox["height"],
        )
        sharpened = ImageEnhance.Sharpness(page.crop(box)).enhance(3)
        sharpened.save("temp.png", format="png")

    easyocrText = "".join(reader.readtext("temp.png", detail=0))
    if easyocrText == "" or not canCallAWS:
        return easyocrText
    rekognitionText = get_rekognition_detect_text("temp.png")
    if rekognitionText != "":
        return rekognitionText
    return easyocrText


# Re-run OCR for every question whose control file asks for regeneration with a
# manual or estimated bounding box.
for question, control in questionAndControl.items():
    if control["regenerate"] == "on" and control["boundingBoxMode"] != "tractract":
        print(question, control)
        if control["boundingBoxMode"] == "manual":
            overrdiedBoundingBox = {
                "left": float(control["left"]),
                "top": float(control["top"]),
                "height": float(control["height"]),
                "width": float(control["width"]),
            }
        else:
            # .iloc[0] takes the first match explicitly instead of relying on the
            # positional fallback of Series[0] on a label index.
            overrdiedBoundingBox = estimatedBoundBoxes.loc[
                [question], "boundingBox"
            ].iloc[0]
        # Use a dedicated name: "images" is the URL -> local path dict built by the
        # download cell, and overwriting it breaks re-running the image path cell.
        imagePaths = df_script["DocumentAnswerImageLocal"][question].to_list()

        texts = [
            ocr_question_image(imagePath, overrdiedBoundingBox)
            for imagePath in imagePaths
        ]
        df_script["DocumentValueOverrided"][question] = texts

Replace NaN with an empty string.

In [None]:
# Missing answers become empty strings; the similarity cell treats "" as "no answer".
df_script["DocumentValueOverrided"].fillna('', inplace=True)

Recalculate the answer similarity

In [None]:
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

# Start from the original similarities; regenerated questions overwrite their column.
df_script["DocumentAnswerSimilarityOverrided"] = df_script[
    "DocumentAnswerSimilarity"
].copy()
for question, control in questionAndControl.items():
    if control["regenerate"] != "on":
        continue
    print("Recalculate Similarity for " + question)
    overridedStandardAnswer = control["overridedStandardAnswer"]
    answers = df_script["DocumentValueOverrided"][question].to_list()
    # Embed the standard answer (row 0) together with the student answers.
    embeddings = model.encode(
        [overridedStandardAnswer] + answers, convert_to_tensor=True
    )
    # Only the similarity to the standard answer is needed, so compare row 0 with
    # the student rows instead of building the full N x N matrix.
    scores = util.pytorch_cos_sim(embeddings[:1], embeddings[1:])[0]
    # Empty answer similarity must be 0.
    similarties = [
        0 if answer == "" else float(score) for answer, score in zip(answers, scores)
    ]
    df_script["DocumentAnswerSimilarityOverrided"][question] = similarties
        

In [None]:
def extract_table(dataframeName, question, newName):
    """Return the ``question`` column of ``df_script[dataframeName]`` as a small
    table: the former index as a column, the question column renamed to
    ``newName``, and a ``RowNumber`` column holding the row position."""
    selected = df_script[dataframeName][[question]].copy()
    selected = selected.reset_index()
    selected["RowNumber"] = selected.index
    return selected.rename(columns={question: newName})


def get_question_df(question):
    """Build the per-student table for one question.

    Joins the answer, image path, confidence, similarity and bounding box of the
    question on ``RowNumber`` and returns the rows sorted by similarity, highest
    first. Missing similarities become 0, missing answers become "", and the
    "output/grading_form/" prefix is stripped from the image paths.
    """
    sources = [
        ("DocumentValueOverrided", "Answer"),
        ("DocumentAnswerImageLocal", "Image"),
        ("DocumentConfidence", "Confidence"),
        ("DocumentAnswerSimilarityOverrided", "Similarity"),
        ("DocumentBoundingBoxImpute", "BoundingBox"),
    ]
    tables = [extract_table(sheet, question, label) for sheet, label in sources]

    dataTable = tables[0]
    for other in tables[1:]:
        # Columns duplicated by the join get the "_remove" suffix and are dropped below.
        dataTable = pd.merge(
            dataTable, other, on="RowNumber", suffixes=("", "_remove")
        )
    duplicated = [name for name in dataTable.columns if "_remove" in name]
    dataTable = dataTable.drop(duplicated, axis=1)

    dataTable["Similarity"] = dataTable["Similarity"].fillna(0)
    dataTable["Answer"] = dataTable["Answer"].fillna("")
    dataTable["Image"] = dataTable["Image"].str.replace("output/grading_form/", "")

    return dataTable.sort_values(by=["Similarity"], ascending=False)


def save_template_output(output, question, filename):
    """Write ``output`` to output/grading_form/questions/<question>/<filename>.

    The question folder is created when it does not exist yet, and an existing
    file is overwritten. Paths are relative to the current working directory.
    """
    questionFolder = Path(
        os.path.join(os.getcwd(), "output", "grading_form", "questions", question)
    )
    questionFolder.mkdir(parents=True, exist_ok=True)
    # The context manager closes the file even when the write fails.
    with open(questionFolder / filename, "w") as text_file:
        text_file.write(output)

In [None]:
from distutils.dir_util import copy_tree

# Copy the shared JavaScript files of the templates into the generated grading form.
# NOTE(review): copy_tree comes from distutils, which is deprecated (PEP 632) and no
# longer ships with Python 3.12+; shutil.copytree(..., dirs_exist_ok=True) is the
# usual replacement if the environment is upgraded.
from_directory = os.path.join(os.getcwd(), "templates", "javascript")
to_directory = os.path.join(os.getcwd(), "output", "grading_form", "javascript")
copy_tree(from_directory, to_directory)

In [None]:
from jinja2 import Template
from jinja2 import Environment, FileSystemLoader

file_loader = FileSystemLoader("templates")
env = Environment(loader=file_loader)
template = env.get_template("index.html")

questions = df_script["DocumentBoundingBox"].columns.to_list()

# A column is a text-answer question unless it is one of the choice option columns.
# Plain membership is used instead of str.fullmatch(question), which would interpret
# the column name as a regular expression.
optionQuestionNames = set(estimatedBoundBoxesOptions.index)
textAnswer = [
    question for question in questions if question not in optionQuestionNames
]
optionAnswer = estimatedOptions.index.to_list()

# Every question that is expected to receive marks (checked in the marksheet section).
questionOfMarks = textAnswer + optionAnswer

output = template.render(
    studentsScriptFileName=studentsScriptFileName,
    textAnswer=textAnswer,
    optionAnswer=optionAnswer,
)
# Make sure the questions folder exists, then write the landing page.
path = Path(os.path.join(os.getcwd(), "output", "grading_form", "questions"))
path.mkdir(parents=True, exist_ok=True)
path = Path(os.path.join(os.getcwd(), "output", "grading_form", "index.html"))
with open(path, "w") as text_file:
    text_file.write(output)

Generate grading form for text answer questions.

In [None]:
questions = df_script["DocumentBoundingBox"].columns.to_list()

# Plain membership is used instead of str.fullmatch(question), which would interpret
# the column name as a regular expression.
optionQuestionNames = set(estimatedBoundBoxesOptions.index)

for question in questions:
    if question in optionQuestionNames:  # skip choice questions
        continue

    dataTable = get_question_df(question)

    standardAnswer = None
    estimatedBoundingBox = None
    if question in df_answer["DocumentValue"].columns:
        standardAnswer = questionToAnswer[question]
    # Only questions that went through the estimation step have an estimated box;
    # the others are rendered with estimatedBoundingBox=None instead of raising KeyError.
    if question in estimatedBoundBoxes.index:
        estimatedBoundingBox = estimatedBoundBoxes.loc[
            [question], "boundingBox"
        ].iloc[0]

    # ID and Name use their own page template.
    if question == "ID" or question == "Name":
        template = env.get_template("questions/index-answer.html")
    else:
        template = env.get_template("questions/index.html")
    output = template.render(
        studentsScriptFileName=studentsScriptFileName,
        question=question,
        standardAnswer=standardAnswer,
        estimatedBoundingBox=estimatedBoundingBox,
        dataTable=dataTable,
    )
    save_template_output(output, question, "index.html")

    template = env.get_template("questions/question.js")
    output = template.render(
        dataTable=dataTable,
        estimatedBoundingBox=estimatedBoundingBox,
    )
    save_template_output(output, question, "question.js")

    template = env.get_template("questions/style.css")
    output = template.render(
        dataTable=dataTable,
    )
    save_template_output(output, question, "style.css")

Generate grading form for choices style questions.

In [None]:
# Local names differ from the notebook-level "choices", "get_answer" and "images" on
# purpose, so that this cell does not overwrite them and earlier cells stay re-runnable.
for question, row in estimatedOptions.iterrows():
    optionColumns = row["choices"]
    estimatedBoundingBox = row["boundingBox"]
    answers = row["answers"]
    # 1 for every option that is part of the standard answer, 0 otherwise.
    answersMask = [
        1 if column.replace(question + "-", "") in answers else 0
        for column in optionColumns
    ]
    choicesDf = [(column, get_question_df(column)) for column in optionColumns]

    # Row numbers and page images are shared by all options; take them from the first.
    rowNumber = choicesDf[0][1]["RowNumber"]
    pageImages = choicesDf[0][1]["Image"]

    df = pd.DataFrame(
        list(zip(rowNumber, pageImages)),
        columns=["RowNumber", "Image"],
        index=rowNumber,
    )

    # Remove Image for option as it is a common column and convert to dict
    choiceDict = {
        column: optionDf.drop(["Image"], axis=1).to_dict("index")
        for column, optionDf in choicesDf
    }

    rowCount = len(df.index)
    # The Series below use the default 0..n-1 index and are aligned to df by label.
    df["choices"] = pd.Series([optionColumns for _ in range(rowCount)])
    df["BoundingBox"] = pd.Series([estimatedBoundingBox for _ in range(rowCount)])
    df["answers"] = pd.Series([answers for _ in range(rowCount)])
    # "1"/"0" per option, depending on whether the student's value is "X".
    df["Answer"] = pd.Series(
        [
            ",".join(
                "1" if choiceDict[column][i]["Answer"] == "X" else "0"
                for column in optionColumns
            )
            for i in range(rowCount)
        ]
    )
    # The confidence of a choice question is that of its least confident option.
    df["Confidence"] = pd.Series(
        [
            min(choiceDict[column][i]["Confidence"] for column in optionColumns)
            for i in range(rowCount)
        ]
    )

    template = env.get_template("questions/index-choices.html")
    output = template.render(
        studentsScriptFileName=studentsScriptFileName,
        question=question,
        standardAnswer=" ".join(answers),
        answersMask=answersMask,
        estimatedBoundingBox=estimatedBoundingBox,
        dataTable=df,
        choiceDict=choiceDict,
    )

    save_template_output(output, question, "index.html")

    # Render the script and style with this question's table (df), not with the
    # "dataTable" variable left over from the text-question cell.
    template = env.get_template("questions/question.js")
    output = template.render(
        dataTable=df,
        estimatedBoundingBox=estimatedBoundingBox,
    )
    save_template_output(output, question, "question.js")

    template = env.get_template("questions/style.css")
    output = template.render(
        dataTable=df,
    )
    save_template_output(output, question, "style.css")

# Start Python HTTPServer

Copy your Jupyter notebook URL, e.g.

https://xxxxx.studio.us-east-2.sagemaker.aws/studiolab/default/jupyter/lab/tree/sagemaker-studiolab-notebooks/marking_form_builder.ipynb

Paste it into a new browser tab and change it to:

https://xxxxx.studio.us-east-2.sagemaker.aws/studiolab/default/jupyter/proxy/8000/

When you finished your grading tasks, you need to stop the webserver with Interrupt Kernel.

The webserver log is in output/server.log.

If you are in development and don't want the notebook being blocked by running webserver, you can open a terminal and run the below 2 commands.

cd sagemaker-studiolab-notebooks/

python server.py 8000

In [None]:
# !python server.py 8000

# Backup grading result

In [None]:
import shutil

# Creates output/grading_form/questions.zip from the questions folder.
shutil.make_archive(
    os.path.join(os.getcwd(), "output", "grading_form", "questions"),
    "zip",
    os.path.join(os.getcwd(), "output", "grading_form", "questions"),
)
# Creates output.zip from the whole output folder (which by now also contains the
# questions.zip written above).
shutil.make_archive(
    os.path.join(os.getcwd(), "output"),
    "zip",
    os.path.join(os.getcwd(), "output"),
)

In [None]:
# Show where the two archives created above can be downloaded from.
display("Download question html files.")
display(os.path.join(os.getcwd(), "output", "grading_form", "questions.zip"))
display("Download grading form files with everything.")
display(os.path.join(os.getcwd(), "output.zip"))

# Generate Marksheet

In [None]:
import os

# Collect the marks saved by the grading form: one mark.json per question folder.
questionDir = os.path.join(os.getcwd(), "output", "grading_form", "questions")
questionAndMarks = {}
for path, currentDirectory, files in os.walk(questionDir):
    if "mark.json" not in files:
        continue
    # The question name is the folder path relative to questionDir.
    question = path[len(questionDir) + 1 :]
    # The context manager closes the file even when json.load raises.
    with open(os.path.join(path, "mark.json")) as f:
        data = json.load(f)
    # An overridden mark wins over the original mark when it is not empty.
    questionAndMarks[question] = {
        i["id"]: (i["overridedMark"] if i["overridedMark"] != "" else i["mark"])
        for i in data
    }
marksDf = pd.DataFrame(questionAndMarks)
# ID and Name come first when they have been marked; selecting them unconditionally
# raises KeyError before those two questions (or any question) have a mark.json.
leadingColumns = [col for col in ("ID", "Name") if col in marksDf.columns]
marksDf = marksDf[
    leadingColumns
    + [col for col in sorted(marksDf.columns) if col != "ID" and col != "Name"]
]
print(marksDf)

questionWithoutMarks should be an empty list; any items in it are questions that have not been marked yet.

In [None]:
# Questions that were generated in the grading form but have no column in the marksheet.
questionWithoutMarks = list(set(questionOfMarks) - set(marksDf.columns))
questionWithoutMarks

Clean all cell outputs before commit to GitHub.

```jupyter nbconvert --clear-output --inplace grading_form_builder.ipynb```