# Preprocessing Student Answer
OCR all student's answer

In [1]:
pdf_file = "../data/TestScript.pdf"

In [2]:
import os

file_name = os.path.basename(pdf_file)
file_name = os.path.splitext(file_name)[0]
base_path = "../marking_form/" + file_name
base_path_images = base_path + "/images/"
base_path_annotations = base_path+"/annotations/"
base_path_questions = base_path+"/grading_form/questions"
base_path_javascript = base_path+"/grading_form/javascript"

# create directory tree for base_path_images
os.makedirs(base_path_images, exist_ok=True)
os.makedirs(base_path_annotations, exist_ok=True)
os.makedirs(base_path_questions, exist_ok=True)
os.makedirs(base_path_javascript, exist_ok=True)

In [16]:
import json
annotations_path = base_path_annotations + "annotations.json"
with open(annotations_path, "r") as f: 
    annotations = json.load(f) 

annotations

{'0': [{'x': 238, 'y': 173, 'width': 322, 'height': 77, 'label': 'NAME'},
  {'x': 175, 'y': 259, 'width': 471, 'height': 83, 'label': 'ID'},
  {'x': 835, 'y': 248, 'width': 168, 'height': 91, 'label': 'CLASS'},
  {'x': 164, 'y': 344, 'width': 322, 'height': 111, 'label': 'Q1'}],
 '1': [{'x': 184, 'y': 482, 'width': 347, 'height': 138, 'label': 'Q36'},
  {'x': 665, 'y': 460, 'width': 410, 'height': 150, 'label': 'Q37'},
  {'x': 1166, 'y': 473, 'width': 384, 'height': 143, 'label': 'Q38'}]}

In [17]:
#flatten annotations to list 
annotations_list = []
for page in annotations:
    for annotation in annotations[page]:
        annotation["page"] = int(page)
        annotations_list.append(annotation) 
annotations_list

# convert annotations_list to dict with key with label
annotations_dict = {}
for annotation in annotations_list:
    annotations_dict[annotation["label"]] = annotation
annotations_dict


{'NAME': {'x': 238,
  'y': 173,
  'width': 322,
  'height': 77,
  'label': 'NAME',
  'page': 0},
 'ID': {'x': 175,
  'y': 259,
  'width': 471,
  'height': 83,
  'label': 'ID',
  'page': 0},
 'CLASS': {'x': 835,
  'y': 248,
  'width': 168,
  'height': 91,
  'label': 'CLASS',
  'page': 0},
 'Q1': {'x': 164,
  'y': 344,
  'width': 322,
  'height': 111,
  'label': 'Q1',
  'page': 0},
 'Q36': {'x': 184,
  'y': 482,
  'width': 347,
  'height': 138,
  'label': 'Q36',
  'page': 1},
 'Q37': {'x': 665,
  'y': 460,
  'width': 410,
  'height': 150,
  'label': 'Q37',
  'page': 1},
 'Q38': {'x': 1166,
  'y': 473,
  'width': 384,
  'height': 143,
  'label': 'Q38',
  'page': 1}}

In [13]:
# extract list of label from annotations as questions
questions = []
for annotation in annotations_list:
    if annotation["label"] not in questions:
        questions.append(annotation["label"])
# remove 'NAME', 'ID', 'CLASS' if exists in questions
if 'NAME' in questions:
    questions.remove('NAME')
if 'ID' in questions:
    questions.remove('ID')
if 'CLASS' in questions:
    questions.remove('CLASS')    

# sort questions 
questions.sort()
questions = ['NAME', 'ID', 'CLASS'] + questions
questions

['NAME', 'ID', 'CLASS', 'Q1', 'Q36', 'Q37', 'Q38']

Check for the regeneration of question.

In [6]:
import os
import json

questionAndControl = {}
for path, currentDirectory, files in os.walk(base_path_questions):
    for file in files:
        if file == "control.json":
            question = path[len(base_path_questions) + 1 :]
            f = open(os.path.join(path, file))
            data = json.load(f)
            if "regenerate" in data:
                questionAndControl[question] = data
            f.close()

questionAndControl

{}

In [13]:
from distutils.dir_util import copy_tree
import shutil
import os

from_directory = os.path.join(os.getcwd(), "..","templates", "javascript")
copy_tree(from_directory, base_path_javascript)
ico = os.path.join(os.getcwd(), "..","templates", "favicon.ico")
# copy ico file  to base_path
shutil.copyfile(ico, base_path+"/favicon.ico")

'../marking_form/TestScript/favicon.ico'

Generate the index.html

In [34]:
from pathlib import Path
from jinja2 import Environment, FileSystemLoader

file_loader = FileSystemLoader("../templates")
env = Environment(loader=file_loader)
template = env.get_template("index.html")

output = template.render(
    studentsScriptFileName=file_name,
    textAnswer=questions,
    optionAnswer=[],
)
# open text file
path = Path(os.path.join(base_path, "index.html"))
text_file = open(path, "w")
text_file.write(output)
text_file.close()

In [46]:
import os
import pandas as pd


def get_the_list_of_files(path):
    """
    Get the list of files in the directory
    """
    files = []
    for dirpath, dirnames, filenames in os.walk(path):
        files.extend(filenames)
        break
    return sorted(files)


images = get_the_list_of_files(base_path_images)

# get max page from annotations_list
max_page = 0
for annotation in annotations_list:
    if annotation["page"] > max_page:
        max_page = annotation["page"]
max_page = max_page + 1

# filter images by file name divided by page
images_by_page = []
for page in range(max_page):
    images_by_page.append([])
    for image in images:
        if str(page) in image:
            images_by_page[page].append(image)


def get_df(question):
    row = annotations_dict[question].copy()
    # rename x to left, y to top
    row["left"] = row.pop("x")
    row["top"] = row.pop("y")
    row["Answer"] = ""
    row["Confidence"] = 0.1
    row["Similarity"] = 0
    row["Image"] = images_by_page[row["page"]]
    # append base_path_images to each image
    row["Image"] = ["images/" + image for image in row["Image"]]
    # expend row to dataframe for each image in row["Image"]
    row = pd.DataFrame(row)
    row = row.explode("Image")
    row = row.reset_index(drop=True)

    return row


def save_template_output(output, question, filename):
    path = Path(base_path_questions, question)
    path.mkdir(parents=True, exist_ok=True)
    path = Path(os.path.join(path, filename))
    text_file = open(path, "w")
    text_file.write(output)
    text_file.close()


question = "NAME"
get_df(question)

Unnamed: 0,width,height,label,page,left,top,Answer,Confidence,Similarity,Image
0,322,77,NAME,0,238,173,,0.1,0,images/0.jpg
1,322,77,NAME,0,238,173,,0.1,0,images/10.jpg
2,322,77,NAME,0,238,173,,0.1,0,images/100.jpg
3,322,77,NAME,0,238,173,,0.1,0,images/101.jpg
4,322,77,NAME,0,238,173,,0.1,0,images/102.jpg
5,322,77,NAME,0,238,173,,0.1,0,images/103.jpg
6,322,77,NAME,0,238,173,,0.1,0,images/104.jpg
7,322,77,NAME,0,238,173,,0.1,0,images/105.jpg
8,322,77,NAME,0,238,173,,0.1,0,images/106.jpg
9,322,77,NAME,0,238,173,,0.1,0,images/107.jpg


Generate individual question page.

In [52]:

for question in questions:

    dataTable = get_df(question)

    # optionsDf = estimatedBoundBoxesOptions[
    #     estimatedBoundBoxesOptions.index.str.fullmatch(question)
    # ]
    # if not optionsDf.empty:  # skip choice questions
    #     continue
    # dataTable = get_question_df(question)

    # standardAnswer = None
    # estimatedBoundingBox = None
    # if question in df_answer["DocumentValueOverrided"].columns:
    #     standardAnswer = questionToAnswer[question]
    # if question in df_script["DocumentBoundingBox"].columns:
    #     estimatedBoundingBox = estimatedBoundBoxes.loc[[question]]["boundingBox"][0]

    if question == "ID" or question == "Name":
        template = env.get_template("questions/index-answer.html")
    else:
        template = env.get_template("questions/index.html")
    output = template.render(
        studentsScriptFileName=file_name,
        question=question,
        standardAnswer="",
        estimatedBoundingBox=annotations_dict[question],
        dataTable=dataTable,
    )
    save_template_output(output, question, "index.html")

    template = env.get_template("questions/question.js")
    output = template.render(
        dataTable=dataTable,
        estimatedBoundingBox=annotations_dict[question],
    )
    save_template_output(output, question, "question.js")

    template = env.get_template("questions/style.css")
    output = template.render(
        dataTable=dataTable,
    )
    save_template_output(output, question, "style.css")

# Start Python HTTPServer

The webserver log is in output/server.log.

If you are in development and don't want the notebook being blocked by running webserver, you can open a terminal and run the below 2 commands.

python server.py 8000