# Preprocessing Student Answer
Create scoring web apps and validate the scoring result.

In [None]:
import os

# Source PDF for this marking run; every output path below is derived from it.
pdf_file = "../data/TestScript.pdf"
# The workbook holding the "NameList" and "Answer" sheets sits next to the PDF
# and shares its stem. splitext swaps only the trailing extension (whatever its
# case), whereas str.replace would rewrite every ".pdf" substring in the path.
standard_answer = os.path.splitext(pdf_file)[0] + ".xlsx"

In [None]:
# Install the system packages ffmpeg, libsm6 and libxext6 through apt (needs sudo
# on a Debian/Ubuntu host).
# NOTE(review): presumably these are native dependencies of the OCR/image stack used
# further down — confirm before removing.
!sudo apt-get update
!sudo apt-get install ffmpeg libsm6 libxext6 -y

In [None]:
import os

# The marking-form folder is named after the PDF, without directory or extension.
file_name = os.path.splitext(os.path.basename(pdf_file))[0]
base_path = "../marking_form/" + file_name

# Note: the images/annotations paths keep a trailing slash, the other two do not;
# later cells concatenate onto them and rely on exactly these spellings.
base_path_images = f"{base_path}/images/"
base_path_annotations = f"{base_path}/annotations/"
base_path_questions = f"{base_path}/questions"
base_path_javascript = f"{base_path}/javascript"

# Create the whole folder tree up front; existing folders are left untouched.
for directory in (
    base_path_images,
    base_path_annotations,
    base_path_questions,
    base_path_javascript,
):
    os.makedirs(directory, exist_ok=True)

In [None]:
import json
annotations_path = base_path_annotations + "annotations.json"
with open(annotations_path, "r") as annotations_file:
    annotations = json.load(annotations_file)

# Flatten the {page: [annotation, ...]} mapping into a single list. Each annotation
# is updated in place: it gains an integer "page", and its "x"/"y" keys are renamed
# to "left"/"top".
annotations_list = []
for page, page_annotations in annotations.items():
    for annotation in page_annotations:
        annotation["page"] = int(page)
        annotation["left"] = annotation["x"]
        annotation["top"] = annotation["y"]
        del annotation["x"], annotation["y"]
        annotations_list.append(annotation)

# Index the annotations by label; when a label occurs more than once the last
# annotation wins.
annotations_dict = {annotation["label"]: annotation for annotation in annotations_list}


In [None]:
# Every distinct annotation label is a question. NAME / ID / CLASS identify the
# student rather than carry an answer, so they are excluded from the sorted
# "questions with an answer" list and then placed first in the full list.
question_with_answer = sorted(
    {annotation["label"] for annotation in annotations_list} - {'NAME', 'ID', 'CLASS'}
)
questions = ['NAME', 'ID', 'CLASS'] + question_with_answer
questions

## Validate Provided Standard Answer for each question

In [None]:
## load standard_answer to dataframe
import pandas as pd
# Read both sheets of the workbook in one call; read_excel returns a dict keyed
# by sheet name when given a list of sheets.
# NOTE(review): `standard_answer` is the workbook path here but a later cell rebinds
# the same name to a dict, so this cell cannot be re-run after that one without
# first re-running the cell that defines the path.
sheets = pd.read_excel(standard_answer, sheet_name=["NameList", "Answer"])
name_list_df = sheets["NameList"]
standard_answer_df = sheets["Answer"]
standard_answer_df.head()

In [None]:
from termcolor import colored

# Report mismatches in both directions between the annotated questions and the
# "Question" column of the answer sheet.
answer_sheet_questions = standard_answer_df["Question"].values

for question in question_with_answer:
    if question in answer_sheet_questions:
        continue
    print(colored(f"Question {question} is not in standard_answer!", 'red'))

for question in answer_sheet_questions:
    if question in question_with_answer:
        continue
    print(colored(f"Question {question} is not in annotations!", 'red'))
            

In [None]:
# Map each question label to its answer. From here on `standard_answer` is this
# dict, no longer the workbook path.
standard_answer = standard_answer_df.set_index("Question")["Answer"].to_dict()
standard_answer

Check which questions are flagged for regeneration (their `control.json` contains a `regenerate` key).

In [None]:
import os
import json

# Collect the parsed control.json of every question folder whose control file
# contains a "regenerate" key, keyed by the folder path relative to
# base_path_questions.
questionAndControl = {}
for path, _subfolders, files in os.walk(base_path_questions):
    if "control.json" not in files:
        continue
    question = path[len(base_path_questions) + 1 :]
    # `with` closes the handle even when the file holds invalid JSON.
    with open(os.path.join(path, "control.json")) as control_file:
        data = json.load(control_file)
    if "regenerate" in data:
        questionAndControl[question] = data

questionAndControl

In [None]:
import os
import shutil

# Copy the shared JavaScript assets and the favicon from ../templates into the
# marking-form folder.
templates_directory = os.path.join(os.getcwd(), "..", "templates")
from_directory = os.path.join(templates_directory, "javascript")
# shutil.copytree(..., dirs_exist_ok=True) merges into the already-created target
# folder. It replaces distutils.dir_util.copy_tree, because distutils was removed
# from the standard library in Python 3.12.
shutil.copytree(from_directory, base_path_javascript, dirs_exist_ok=True)
ico = os.path.join(templates_directory, "favicon.ico")
# copy ico file to base_path
shutil.copyfile(ico, base_path + "/favicon.ico")

Generate the index.html

In [None]:
from pathlib import Path
from jinja2 import Environment, FileSystemLoader

# Jinja environment rooted at ../templates; `env` is reused by later cells to
# render the per-question pages.
# NOTE(review): the environment is created without autoescape and later cells pass
# OCR'd text into these templates — confirm the templates escape it themselves.
file_loader = FileSystemLoader("../templates")
env = Environment(loader=file_loader)
template = env.get_template("index.html")

# The landing page lists every question, identity fields included.
output = template.render(
    studentsScriptFileName=file_name,
    textAnswer=questions,
    optionAnswer=[],
)
path = Path(os.path.join(base_path, "index.html"))
# `with` guarantees the handle is closed even if the write fails.
with open(path, "w") as text_file:
    text_file.write(output)

In [None]:
import easyocr
import tempfile
from PIL import Image, ImageEnhance
import os

# Language codes handed to the EasyOCR reader.
easyocrLanguages = ["en"]
# Single module-level reader, reused by every ocr_image_from_file call.
# NOTE(review): gpu=True presumably requires a CUDA-capable setup to take effect —
# confirm the behaviour on CPU-only machines.
reader = easyocr.Reader(easyocrLanguages, gpu=True)

def ocr_image_from_file(image_path, left, top, width, height):
    """Crop a rectangle out of an image, sharpen it and return the OCR text.

    Args:
        image_path: Path of the image file to open.
        left: X coordinate of the crop's left edge.
        top: Y coordinate of the crop's upper edge.
        width: Width of the crop.
        height: Height of the crop.

    Returns:
        The text fragments returned by the module-level ``reader``, concatenated
        without a separator.
    """
    # PIL's crop takes (left, upper, right, lower).
    crop_box = (left, top, left + width, top + height)

    # The crop is written inside a private temporary directory. Unlike reusing the
    # name of an already-deleted NamedTemporaryFile, nothing else can claim that
    # path in the meantime, and the directory is removed even if OCR raises.
    with tempfile.TemporaryDirectory() as scratch_directory:
        crop_path = os.path.join(scratch_directory, "crop.png")
        with Image.open(image_path) as page_image:
            cropped = page_image.crop(crop_box)
            # Sharpness factor 3 is applied before the crop is handed to OCR.
            sharpened = ImageEnhance.Sharpness(cropped).enhance(3)
            sharpened.save(crop_path, format="png")
        fragments = reader.readtext(crop_path, detail=0)
    return "".join(fragments)

### Get embedding 
You can change the logic and model according to https://www.sbert.net/ 

In [None]:
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by calculate_similarity; loaded once for the whole notebook.
model = SentenceTransformer("all-MiniLM-L6-v2")

def calculate_similarity(answers, question):
    """Score each answer by cosine similarity to the question's standard answer.

    Args:
        answers: List of answer strings. The list is not modified.
        question: Question label used to look up ``standard_answer``.

    Returns:
        A list with one entry per answer, in the same order: ``0`` for every
        answer when the question has no standard answer, ``0`` for an answer
        that is the empty string, otherwise the cosine similarity as a float.
    """
    if question not in standard_answer:
        return [0] * len(answers)
    if not answers:
        return []

    # Encode the standard answer together with the answers, on a fresh list so the
    # caller's list is left untouched.
    sentences = [standard_answer[question], *answers]
    embeddings = model.encode(sentences, convert_to_tensor=True)

    # Only the standard answer (row 0) is compared against the answers; the full
    # answer-by-answer matrix is never needed.
    scores = util.pytorch_cos_sim(embeddings[:1], embeddings[1:])[0].tolist()

    # Empty answer similarity must be 0.
    return [0 if answer == "" else score for answer, score in zip(answers, scores)]

In [None]:
import os
import pandas as pd


def get_the_list_of_files(path):
    """
    Return the sorted names of the files directly inside ``path``.

    Sub-directories are not descended into. When ``os.walk`` yields nothing for
    ``path`` (for example because it does not exist) the result is an empty list.
    """
    _root, _subfolders, filenames = next(os.walk(path), (path, [], []))
    return sorted(filenames)


images = get_the_list_of_files(base_path_images)

# Highest page index used by any annotation (never below 0).
max_page = max([0] + [annotation["page"] for annotation in annotations_list])
# Convert that index into the number of pages per script, rounded up to an even
# count because the scanner adds a blank page: odd index -> +1, even index -> +2.
# The `else` branch must be the constant 2. A conditional expression binds more
# loosely than `+`, so writing `else max_page + 2` adds max_page twice (an index
# of 2 would give 6 pages instead of 4).
max_page = max_page + (1 if max_page % 2 == 1 else 2)

# Bucket the image files by their position within a script: the numeric part of
# the file name modulo the pages per script. One pass keeps the sorted file order
# inside every bucket.
images_by_page = [[] for _ in range(max_page)]
for image in images:
    try:
        image_number = int(image.split(".")[0])
    except ValueError:
        # Skip files whose name does not start with a number (e.g. hidden or
        # placeholder files) instead of aborting the whole cell.
        continue
    images_by_page[image_number % max_page].append(image)


def get_df(question):
    """Build the marking table for one question.

    The table has one row per image listed in ``images_by_page`` for the page of
    the question's annotation. Each row carries the annotation's fields, the OCR'd
    "Answer" for the annotated rectangle, a 1-based "RowNumber", the annotation's
    page as "maskPage", the "Similarity" to the standard answer, and a "Mark"
    pre-filled with "0" for blank answers.

    Args:
        question: Label of the question, used as key into ``annotations_dict``.

    Returns:
        The populated ``pandas.DataFrame``.
    """
    annotation = annotations_dict[question]
    record = {
        **annotation,
        "Confidence": 0.1,
        "Similarity": 0,
        # Image paths are stored relative to base_path.
        "Image": ["images/" + image for image in images_by_page[annotation["page"]]],
    }

    # The scalar fields are repeated for every entry of the "Image" list.
    table = pd.DataFrame(record).explode("Image").reset_index(drop=True)

    def read_answer(table_row):
        # OCR the annotated rectangle of this row's image.
        return ocr_image_from_file(
            base_path + "/" + table_row["Image"],
            table_row["left"],
            table_row["top"],
            table_row["width"],
            table_row["height"],
        )

    table["Answer"] = table.apply(read_answer, axis=1)
    table["RowNumber"] = table.index + 1
    # Keep the annotation's page before "page" is overwritten below.
    table["maskPage"] = table["page"]

    table["Similarity"] = calculate_similarity(table["Answer"].tolist(), question)

    # "page" becomes the image file name without folder prefix and ".jpg" suffix.
    table["page"] = table["Image"].apply(
        lambda image: image.replace("images/", "").replace(".jpg", "")
    )
    # Blank answers are pre-marked "0"; everything else is left for the marker.
    table["Mark"] = table["Answer"].apply(
        lambda answer: "" if answer.strip() else "0"
    )

    return table


def save_template_output(output, question, filename):
    """Write rendered template text into the folder of one question.

    Args:
        output: The text to write.
        question: Question label; the folder ``base_path_questions/question`` is
            created if it does not exist.
        filename: Name of the file to create or overwrite inside that folder.
    """
    question_directory = Path(base_path_questions, question)
    question_directory.mkdir(parents=True, exist_ok=True)
    # `with` guarantees the handle is closed even if the write fails.
    with open(question_directory / filename, "w") as text_file:
        text_file.write(output)


# question = "NAME"
# get_df(question)

Generate individual question page.

In [None]:
from ipywidgets import IntProgress
from IPython.display import display

max_count = len(questions)
progress_bar = IntProgress(min=0, max=max_count)  # one tick per question
display(progress_bar)

for question in questions:
    dataTable = get_df(question)
    question_directory = base_path_questions + "/" + question
    os.makedirs(question_directory, exist_ok=True)
    dataTable.to_csv(question_directory + "/data.csv", index=False)

    # ID / NAME / CLASS use the "index-answer" page; every other question uses
    # the regular question page.
    page_template = (
        "questions/index-answer.html"
        if question in ("ID", "NAME", "CLASS")
        else "questions/index.html"
    )
    bounding_box = annotations_dict[question]

    # (template, output file, template variables), rendered and saved in this order.
    renders = [
        (
            page_template,
            "index.html",
            dict(
                studentsScriptFileName=file_name,
                question=question,
                standardAnswer=standard_answer.get(question, ""),
                estimatedBoundingBox=bounding_box,
                dataTable=dataTable,
            ),
        ),
        (
            "questions/question.js",
            "question.js",
            dict(dataTable=dataTable, estimatedBoundingBox=bounding_box),
        ),
        (
            "questions/style.css",
            "style.css",
            dict(dataTable=dataTable),
        ),
    ]
    for template_name, output_name, context in renders:
        output = env.get_template(template_name).render(**context)
        save_template_output(output, question, output_name)

    progress_bar.value += 1

## Validate Student ID

In [None]:
# load csv file to dataframe
import pandas as pd

# dtype=str keeps every OCR'd ID exactly as written in data.csv. Without it an
# all-numeric column is type-inferred, which can drop leading zeros or add ".0".
id_from_oscr = pd.read_csv(
    base_path_questions + "/" + "ID" + "/data.csv", dtype={"Answer": str}
)["Answer"].tolist()
id_from_namelist = name_list_df["StudentID"].to_list()

# check duplicate id: a single pass with two sets instead of list.count per
# element, reported in a stable sorted order.
seen_ids = set()
repeated_ids = set()
for student_id in id_from_oscr:
    if student_id in seen_ids:
        repeated_ids.add(student_id)
    else:
        seen_ids.add(student_id)
duplicate_id = sorted(repeated_ids, key=str)
if len(duplicate_id) > 0:
    print(colored("Duplicate ID: {}".format(duplicate_id), "red"))

# Compare as strings on both sides. The loop variable is deliberately not called
# `id`: at notebook top level that would shadow the builtin for the whole kernel.
id_from_oscr = [str(student_id) for student_id in id_from_oscr]
id_from_namelist = [str(student_id) for student_id in id_from_namelist]

# compare oscr_id and validate_id
known_ids = set(id_from_namelist)
scanned_ids = set(id_from_oscr)
# OCR'd IDs that do not appear in the name list (likely OCR errors).
name_list_missing_id = [
    student_id for student_id in id_from_oscr if student_id not in known_ids
]
# Name-list IDs that were never read by OCR (absent students or OCR errors).
ocr_missing_id = [
    student_id for student_id in id_from_namelist if student_id not in scanned_ids
]

## OCR scan error case

In [17]:
from termcolor import colored
# Report the OCR'd IDs that have no match in the name list. The guard tests the
# same list that is printed; testing ocr_missing_id here could print the heading
# with nothing under it, or stay silent while unmatched OCR IDs exist.
if len(name_list_missing_id) > 0:
    print(colored("Some IDs OCR is not in NameList and you need to fix it manually!", "red"))
    for unmatched_id in name_list_missing_id:
        print(colored(unmatched_id, "red"))

[31mSome IDs OCR is not in NameList and you need to fix it manually![0m
[31m230 22q /53[0m
[31mL7v (2u-)[0m
[31m3 00 7*771[0m
[31m230 IT0 [ 9 /[0m
[31m1 300443~|[0m
[31mID:230345870[0m
[31m23ool0 814[0m
[31m2300540 60[0m
[31mNarlc[0m
[31mLJUv 7 <k[0m
[31mV 7[0m
[31mFJU) ( 60>[0m
[31mL J U (t CLS 6[0m
[31mLau(Yo ( 0[0m
[31m"[ $ U ( |[0m
[31m150t60 | | ([0m
[31m2500 77675[0m
[31mYJu 0  6 S2 |[0m
[31m'232A8 (8 6[0m
[31m2 5 0 & / 0 > T3[0m
[31m0 [1 }[0m
[31m2303538s }[0m
[31m2z01tutT[0m
[31m2500 % 084 S[0m


## Potential Absent Case

In [18]:
from termcolor import colored

# Name-list IDs that OCR never produced: candidates for absent students.
absentee_count = len(ocr_missing_id)
if absentee_count:
    print(colored(f"Number of absentee {absentee_count}.", "red"))
    print(colored("ID in Name List does not find from OCR!", "red"))
    for absent_id in ocr_missing_id:
        print(colored(absent_id, "red"))

[31mNumber of absentee 25.[0m
[31mID in Name List does not find from OCR![0m
[31m230108158[0m
[31m230318186[0m
[31m230034000[0m
[31m230070545[0m
[31m230066821[0m
[31m230166592[0m
[31m230090894[0m
[31m230098093[0m
[31m230108933[0m
[31m230151517[0m
[31m230345729[0m
[31m230074777[0m
[31m230353858[0m
[31m230042915[0m
[31m230241744[0m
[31m230077675[0m
[31m230229153[0m
[31m230160471[0m
[31m230345570[0m
[31m230110191[0m
[31m230047321[0m
[31m230045874[0m
[31m230142256[0m
[31m230034663[0m
[31m230128250[0m


# Start Python HTTPServer

The webserver log is in output/server.log.

If you are developing and don't want the running web server to block the notebook, open a terminal and run the command below.

file_name=XXXX python server.py 8000

In [19]:
# Show the exact terminal command that serves this marking form.
print(f"file_name={file_name} python server.py 8000")

file_name=TestScript python server.py 8000


In [None]:
# You can also uncomment the following line to run the web server but if it crashes, you need to restart the kernel.
# !cd .. && file_name=TestScript python server.py 8000

# Post Processing after scoring
1. Check that every question has been scored.
2. Check ID again.

In [20]:
# check each sub folder of base_path_questions contains file name mark.json, ignore the root folder
import os
import json

# A question counts as unfinished when its folder has no mark.json, or when any
# entry in mark.json has both "mark" and "overridedMark" empty.
unfinsihed_scoring = []
for path, _subfolders, files in os.walk(base_path_questions):
    if path == base_path_questions:
        continue  # the root folder itself is not a question
    # extract question name from path
    question = path[len(base_path_questions) + 1 :]
    if "mark.json" not in files:
        unfinsihed_scoring.append(question)
        continue
    with open(os.path.join(path, "mark.json"), "r") as mark_file:
        marks = json.load(mark_file)
    if any(mark['mark'] == "" and mark['overridedMark'] == "" for mark in marks):
        unfinsihed_scoring.append(question)

if unfinsihed_scoring:
    print(colored("{} have some question not yet mark!".format(unfinsihed_scoring), "red"))
else:
    print("All questions have been marked!")

All questions have been marked!


Check ID

In [21]:
import os
import json

with open(os.path.join(base_path_questions, "ID", "mark.json"), "r") as mark_file:
    marks = json.load(mark_file)

# An overridden mark takes precedence over the original one; both sides are
# compared as strings.
id_from_mark = [
    str(entry["overridedMark"] if entry["overridedMark"] != "" else entry["mark"])
    for entry in marks
]
id_from_namelist = [
    str(student_id) for student_id in name_list_df["StudentID"].to_list()
]

# Students on the name list without a marked script.
mark_missing_id = [
    student_id for student_id in id_from_namelist if student_id not in id_from_mark
]
print(colored("In class but not marked - {}!".format(mark_missing_id), "red"))

# Marked scripts whose ID is not on the name list.
marked_but_not_in_namelist = [
    student_id for student_id in id_from_mark if student_id not in id_from_namelist
]

print(colored("Marked ID but not in class - {}!".format(marked_but_not_in_namelist), "red"))

[31mIn class but not marked - ['230345729']![0m
[31mMarked ID but not in class - []![0m


### Remove version history
Run this cell before you back up.

In [None]:
## Remove files that start with control- or mark- and end with .json in base_path_questions, recursively.
import os

for path, _subfolders, files in os.walk(base_path_questions):
    for file in files:
        # Require the .json suffix as well as the prefix, as stated above, so an
        # unrelated file that merely starts with "mark-" or "control-" is never
        # deleted. control.json and mark.json themselves have no dash and are kept.
        if file.startswith(("control-", "mark-")) and file.endswith(".json"):
            os.remove(os.path.join(path, file))

### Reset everything (Danger)
Remove `mark.json` and `control.json` from every question folder.

In [None]:
# import os
# for path, currentDirectory, files in os.walk(base_path_questions):
    
#     for file in files:       
#         if file == "control.json" or file == "mark.json":
#             os.remove(os.path.join(path, file))