In [None]:
%matplotlib inline
from matplotlib import pyplot

In [None]:
import os
from datetime import datetime
import subprocess
import shutil
import argparse
import itertools
from tempfile import mkdtemp
from xml.dom import minidom
from functools import lru_cache

import cv2
import zbarlight
import numpy as np
import pandas
import yaml
from PIL import Image

from pony.orm import db_session
import db

# Protocol version of the exam metadata this notebook understands. It is
# compared against the YAML file's 'protocol_version' entry and, prefixed with
# 'v', against the version field of the decoded QR payload.
YAML_VERSION = 0

In [None]:
import zbar  # From zbar-py, better than zbarlight because it returns coordinates!

In [None]:
# NOTE(review): hardcoded absolute, machine-specific path. All relative paths
# used by the following cells are resolved against this working directory.
os.chdir('/home/ton/Bot/current_work/grader_app/test1_data/')
# Directory listing at the time this cell runs.
images = os.listdir()

In [None]:
@lru_cache()
def studentnr_data(dpi):
    """Prepare data for extracting the student number.

    Reads ``number_widget.svg`` from the current working directory, rasterises
    it with the external ``convert`` program and returns the geometry of its
    ellipses in pixel units. Results are memoised per ``dpi`` by ``lru_cache``,
    so the returned arrays are shared between calls and must not be modified
    in place.

    Parameters
    ==========
    dpi : number
        Resolution of the scan the template has to match.

    Returns
    =======
    template : array
        A properly scaled image of the widget with source drawing.
    centers : array of int
        Sorted coordinates of the centers of all circles from the matrix.
    r : int
        Radius of the circles.

    Raises
    ======
    ValueError
        If the SVG contains no ``ellipse`` elements.
    subprocess.CalledProcessError
        If ``convert`` exits with a non-zero status.
    RuntimeError
        If the rasterised template cannot be read back.

    """
    # Conversion factor from SVG units to pixels.
    # NOTE(review): the 100/90 / 90 factor is kept exactly as in the original
    # code; presumably 90 SVG units per inch plus a 100/90 correction - confirm.
    scale = dpi * 100/90 / 90

    doc = minidom.parse('number_widget.svg')
    ellipses = doc.getElementsByTagName('ellipse')
    if not ellipses:
        raise ValueError("number_widget.svg contains no ellipse elements")

    centers = np.array([(float(path.getAttribute('cx')),
                         float(path.getAttribute('cy')))
                        for path in ellipses])
    # lexsort uses the last key (cy) as primary and cx as secondary sort key.
    centers = centers[np.lexsort(centers.T)]
    centers = np.rint(scale * centers).astype(int)
    r = int(float(ellipses[0].getAttribute('rx')) * scale)

    png_path = 'number_widget.png'
    try:
        # check=True: a failing conversion must not go unnoticed.
        subprocess.run(['convert', '-density', str(dpi * 100//90),
                        'number_widget.svg', png_path], check=True)
        template = cv2.imread(png_path)
    finally:
        # Never leave the intermediate raster behind, even on failure.
        if os.path.exists(png_path):
            os.remove(png_path)
    if template is None:
        # cv2.imread reports unreadable files by returning None.
        raise RuntimeError("Couldn't read the rasterised number widget")
    return template, centers, r

In [None]:
def pdf_to_images(filename, loc=None):
    """Extract all images out of a pdf file.

    Parameters
    ----------
    filename : str
        Path to the pdf file.
    loc : str, optional
        Directory that receives the extracted images. Defaults to the current
        working directory.

    Returns
    -------
    list of str
        Names of all entries of `loc` after the extraction (not only the
        newly written images).

    Raises
    ------
    subprocess.CalledProcessError
        If ``pdfimages`` exits with a non-zero status.
    """
    loc = loc or os.getcwd()
    # The last pdfimages argument is an output prefix. It has to be built from
    # the bare file name: joining `loc` with a filename that still carries its
    # directory part (e.g. '../scan.pdf') made the prefix point next to the
    # pdf instead of into `loc`.
    stem = os.path.splitext(os.path.basename(filename))[0]
    subprocess.run(['pdfimages', '-png', filename, os.path.join(loc, stem)],
                   check=True)
    return os.listdir(loc)

# Extract the page images of the scanned exam; the cell output is the
# directory listing returned by pdf_to_images.
pdf_to_images('../scans_test1.pdf')

In [None]:
def read_yaml(filename):
    """Load the exam description and split its widgets into QR codes and the rest.

    Parameters
    ----------
    filename : str
        Path to the exam metadata YAML file.

    Returns
    -------
    name
        The value of the file's 'name' entry.
    qr : pandas.DataFrame
        The widgets whose name contains 'qrcode', indexed by widget name.
    widgets : pandas.DataFrame
        All remaining widgets, indexed by widget name.

    Raises
    ------
    RuntimeError
        If the file's 'protocol_version' differs from YAML_VERSION.
    """
    with open(filename) as f:
        # safe_load instead of yaml.load(f): calling yaml.load without a
        # Loader is rejected by PyYAML >= 6 and, on older versions, may
        # construct arbitrary Python objects from the file.
        exam_data = yaml.safe_load(f)

    if exam_data['protocol_version'] != YAML_VERSION:
        raise RuntimeError('Only v{} supported'.format(YAML_VERSION))

    widgets = pandas.DataFrame(exam_data['widgets']).T
    widgets.index.name = 'name'
    is_qr = widgets.index.str.contains('qrcode')
    return exam_data['name'], widgets[is_qr], widgets[~is_qr]

# Module-level state used by later cells: `qr` is read as a global by
# rotate_and_get_offest and by the cell that builds `qr_box`.
name, qr, widgets = read_yaml('../minitest1.yml')

In [None]:
def guess_dpi(image_array):
    """Estimate the resolution of a scanned A4 page from its pixel size.

    Height and width in pixels are converted to dots per inch using the A4
    dimensions (297 mm x 210 mm, 25.4 mm per inch) and rounded to the nearest
    ten. The page is expected in portrait orientation (rows first).

    Returns the common estimate as a numpy float.

    Raises ValueError when the vertical and horizontal estimates disagree.
    """
    rows, cols, *_ = image_array.shape
    per_axis = np.round(np.array([rows, cols]) / (297, 210) * 25.4, -1)
    vertical, horizontal = per_axis
    if vertical == horizontal:
        return vertical
    raise ValueError("The image doesn't appear to be A4.")

In [None]:
def get_box(image_array, box, padding):
    """Extract a subblock from an array corresponding to a scanned A4 page.

    Parameters:
    -----------
    image_array : 2D or 3D array
        The image source.
    box : 4 floats (top, bottom, left, right)
        Coordinates of the bounding box in inches. Due to differing
        traditions, box coordinates are counted from the bottom left of the
        image, while image array coordinates are from the top left.
    padding : float
        Padding around box borders in inches.

    Returns:
    --------
    array
        The part of `image_array` covered by the padded box, clipped to the
        page borders.
    """
    h, w, *_ = image_array.shape
    dpi = guess_dpi(image_array)
    # Work on a float copy: an integer `box` has to accept fractional padding,
    # and the caller's sequence must stay untouched.
    box = np.array(box, dtype=float)
    box += (padding, -padding, -padding, padding)
    top, bottom, left, right = (box * dpi).astype(int)
    top, bottom = np.clip((top, bottom), 0, h)
    left, right = np.clip((left, right), 0, w)
    # Convert the bottom-up page coordinates to top-down row indices.
    # Negative indices (`-top:-bottom`) are avoided on purpose: for a box that
    # reaches the lower page edge bottom is 0, and `-0` would select nothing.
    return image_array[h - top:h - bottom, left:right]

In [None]:
def extract_qr(image_path, scale_factor=4):
    """Locate and decode the QR code on a scanned page.

    Parameters
    ----------
    image_path : str
        Path of the image file to scan.
    scale_factor : int
        Only every `scale_factor`-th row and column of the image is passed to
        the scanner, so reported positions refer to the downsampled image.

    Returns
    -------
    The first result reported by ``zbar.Scanner().scan`` for the first
    threshold that yields any result.

    Raises
    ------
    RuntimeError
        If the image cannot be read or no code is found at any threshold.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread reports unreadable input by returning None; fail with a
        # clear message instead of a TypeError from slicing None.
        raise RuntimeError("Couldn't read image " + image_path)
    image = image[::scale_factor, ::scale_factor]
    # Landscape pages are transposed so that the long side is vertical.
    if image.shape[0] < image.shape[1]:
        image = image.T
    for threshold in (200, 150, 220):  # Because zbar is picky.
        thresholded = 255 * (image > threshold)
        scanner = zbar.Scanner()
        results = scanner.scan(thresholded.astype(np.uint8))
        if results:
            return results[0]
    raise RuntimeError("Couldn't extract qr code from "
                       + image_path)

# One decoded QR result per directory entry, in os.listdir() order.
# NOTE(review): every entry of the working directory is passed to extract_qr,
# whether or not it is a page image.
extracted_qrs = [extract_qr(image) for image in os.listdir()]

In [None]:
def rotate_and_get_offest(image_path, extracted_qr, scale_factor=4):
    """Bring a scanned page upright and measure the displacement of its QR code.

    The expected QR position is looked up in the module-level `qr` table for
    the page number encoded in the QR payload. If the detected code lies in
    the wrong half of the page along an axis, the image is flipped along that
    axis. The image file is overwritten in place with the result.

    Parameters
    ----------
    image_path : str
        Path of the image; it is read and rewritten.
    extracted_qr
        Scan result for this image. Its `.data` must be bytes whose third
        ';'-separated field is the page number, and its `.position` the corner
        coordinates of the code as (x, y) pairs in the downsampled image.
    scale_factor : int
        Downsampling factor that was used when `extracted_qr` was obtained.

    Returns
    -------
    (dx, dy) : tuple of float
        Detected minus expected position of the QR code centre, in pixels of
        the full-size upright image.

    Raises
    ------
    RuntimeError
        If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports unreadable input by returning None.
        raise RuntimeError("Couldn't read image " + image_path)

    # Landscape pages are transposed so that the long side is vertical.
    if image.shape[0] < image.shape[1]:
        image = np.transpose(image, (1, 0, 2))

    dpi = guess_dpi(image)
    h, w, *_ = image.shape
    page = int(extracted_qr.data.decode().split(';')[2])
    box = dpi * qr[qr.page == page][['top', 'bottom', 'left', 'right']].values[0]
    # Box coordinates count from the bottom of the page, image rows from the top.
    y0, x0 = h - np.mean(box[:2]), np.mean(box[2:])
    x, y = np.mean(scale_factor * np.array(extracted_qr.position), axis=0)
    # x runs along the columns (axis 1) and y along the rows (axis 0), so each
    # mismatch has to mirror its own axis. The original code had the two
    # flips swapped, which only cancels out when both conditions hold.
    if (x > w / 2) != (x0 > w / 2):
        image = image[:, ::-1]
        x = w - x
    if (y > h / 2) != (y0 > h / 2):
        image = image[::-1]
        y = h - y

    cv2.imwrite(image_path, image)
    return x-x0, y-y0

# NOTE(review): `qrs` is not defined in any visible cell - presumably an
# earlier name of `extracted_qrs`; confirm before running on a fresh kernel.
rotate_and_get_offest('teeeest.png', qrs[19])

In [None]:
for image_path, extracted_qr in zip(os.listdir(), extracted_qrs):
    # TODO: the loop body was never written; `pass` keeps the cell
    # syntactically valid. Presumably this should call
    # rotate_and_get_offest(image_path, extracted_qr) - confirm.
    pass

# So far so good

In [None]:
# NOTE(review): `image` is not assigned at notebook level in any visible cell,
# so this cell relies on leftover kernel state. The slice bounds are
# hand-picked pixel coordinates.
pyplot.imshow(image[179*2:293*2, 1018*2:1133*2])

In [None]:
# NOTE(review): `qrs` is not defined in any visible cell - presumably
# `extracted_qrs`; confirm.
qrs[0].position

In [None]:
# NOTE(review): `qrs` is not defined in any visible cell - presumably
# `extracted_qrs`; confirm.
len(qrs)

In [None]:
def extract_qr_and_rotate(image_path):
    """Decode the QR code of a scanned page, trying all four flipped orientations.

    For each threshold the binarised page is tried unchanged and flipped along
    either or both axes; only the region given by the module-level `qr_box`
    (padded by 0.7 inch) is handed to zbarlight.

    NOTE(review): `qr_box` is a global that has to be assigned before this
    function is called.

    Parameters
    ----------
    image_path : str
        Path of the image file to scan.

    Returns
    -------
    The first result of ``zbarlight.scan_codes`` that is not None.

    Raises
    ------
    RuntimeError
        If no orientation/threshold combination yields a code.
    """
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    # Landscape pages are transposed so that the long side is vertical.
    if image.shape[0] < image.shape[1]:
        image = np.transpose(image, [1, 0])
    for threshold in (200, 150, 220):
        # 8-bit greyscale, so that PIL can wrap the array as an image.
        thresholded = (255 * (image > threshold)).astype(np.uint8)
        for direction in itertools.product([1, -1], [1, -1]):
            rotated = thresholded[::direction[0], ::direction[1]]
            candidate = get_box(rotated, qr_box, .7)
            # The original debugging leftovers (saving 'test.png' followed by
            # an unconditional `break`) made the decoding below unreachable,
            # so the function always raised.
            code = zbarlight.scan_codes('qrcode', Image.fromarray(candidate))
            if code is not None:
                return code
    raise RuntimeError("Couldn't extract qr code from "
                       + image_path)

# Try the function on one extracted page.
# NOTE(review): it reads the global `qr_box`, which is only assigned in the
# following cell.
extract_qr_and_rotate('.-000.ppm')

In [None]:
# (top, bottom, left, right) of the smallest box enclosing every row of `qr`.
qr_box = (max(qr.top), min(qr.bottom), min(qr.left), max(qr.right))

# NOTE(review): `image` is not assigned at notebook level in any visible cell.
pyplot.imshow(get_box(image, qr_box, .5))

In [None]:
def init_db(students='students.csv', graders='graders.csv',
         meta_yaml='test_exam/tussentoets2016-6.yml',
         scanned_pdf='test_exam/sample_data.pdf', overwrite=False):
    """Cut a scanned exam into per-widget images and register them in the database.

    Every image extracted from `scanned_pdf` is identified through its QR code
    ('version;name;page;copy'). Each widget of that page is cropped and
    written to '<name>_data/<widget><copy>.png'. The 'studentnr' widget is
    additionally decoded with extract_number and attached to the submission;
    every other widget becomes a Solution of the submission.

    Parameters
    ----------
    students, graders, overwrite
        Currently unused.
    meta_yaml : str
        Path of the exam metadata YAML file.
    scanned_pdf : str
        Path of the pdf containing the scanned pages.

    Raises
    ------
    RuntimeError
        If the metadata or QR version is unsupported, a QR code cannot be
        decoded, or the scan belongs to a different exam than the metadata.
    subprocess.CalledProcessError
        If ``pdfimages`` exits with a non-zero status.

    NOTE(review): Submission, Student, Solution and Problem are not imported
    in any visible cell (only `import db` is) - confirm where they come from.
    """
    with open(meta_yaml) as f:
        # safe_load instead of yaml.load(f): calling yaml.load without a
        # Loader is rejected by PyYAML >= 6 and, on older versions, may
        # construct arbitrary Python objects from the file.
        exam_data = yaml.safe_load(f)

    if exam_data['protocol_version'] != YAML_VERSION:
        raise RuntimeError('Only v{} supported'.format(YAML_VERSION))
    widgets = pandas.DataFrame(exam_data['widgets']).T
    widgets.index.name = 'name'
    widgets = widgets[~widgets.index.str.contains('qrcode')]

    # Measured offset of coordinates between original and scanned for 300 dpi.
    # Should eventually avoid, and use e.g. cv2.matchTemplate
    delta = np.array([200, 50, -40, 10])
    result_version = 'v' + str(YAML_VERSION)
    tmp = mkdtemp()
    try:
        # check=True: a failing extraction must not look like an empty scan.
        subprocess.run(['pdfimages', scanned_pdf, tmp + '/'], check=True)
        images = os.listdir(tmp)
        for image_path in images:
            image_path = os.path.join(tmp, image_path)
            image = cv2.imread(image_path)
            # 11.6929 in = 297 mm, the height of an A4 page.
            dpi = int(round(image.shape[0] / 11.6929, -1))
            # zbar is picky about contrast, so we threshold the image.
            # To go sure we try several values of the threshold.
            # Can we do anything better?
            # TODO: don't be lazy and only feed the correct corner to zbar
            for threshold in (200, 150, 220):
                _, thresholded = cv2.threshold(image, threshold, 255,
                                               cv2.THRESH_BINARY)
                code = zbarlight.scan_codes('qrcode',
                                            Image.fromarray(thresholded))
                if code is not None:
                    break
            else:
                raise RuntimeError("Couldn't extract qr code from "
                                   + image_path)
            version, name, page, copy = code[0].decode().split(';')
            if version != result_version:
                raise RuntimeError('Unknown test version,'
                                   ' only {} supported'.format(result_version))
            if name != exam_data['name']:
                raise RuntimeError('yaml and pdf are from different exams')
            target = name + '_data'
            os.makedirs(target, exist_ok=True)

            # Widget boxes of this page, converted from inches to pixels.
            coords =  (dpi * widgets[widgets.page == int(page)]).astype(int)
            delta_dpi = (delta * dpi / 300).astype(int)
            for widget in coords.itertuples():
                # Box coordinates count from the bottom of the page, hence
                # the negative row indices.
                widget_data = image[-widget.top - delta_dpi[0]
                                    : -widget.bottom - delta_dpi[1],
                                    widget.left + delta_dpi[2]
                                    : widget.right + delta_dpi[3]]
                filename = os.path.join(target, widget.Index + copy + '.png')
                cv2.imwrite(filename, widget_data)
                if widget.Index == 'studentnr':
                    # TODO: Hook up the student number to the database.
                    possible_student_nr = extract_number(widget_data, dpi)
                    print(possible_student_nr)
                    with db_session:
                        sub = Submission.get(id=copy) or Submission(id=copy)
                        sub.student = Student.get(id=possible_student_nr)
                        sub.signature_image_path = filename
                else:
                    with db_session:
                        sub = Submission.get(id=copy) or Submission(id=copy)
                        Solution(problem=Problem.get(name=widget.Index),
                                 image_path=filename, submission=sub)

    finally:
        # The extracted page images are only needed while cropping.
        shutil.rmtree(tmp, ignore_errors=True)

In [None]:
def extract_number(image, dpi):
    """Read the number marked on a scanned student-number widget.

    The widget template from studentnr_data is located in `image` by template
    matching. Around every circle centre the darkness of a square with half
    side `r` is summed, and within each of the 7 groups of 10 consecutive
    circles the darkest one selects a digit (position 1..9 maps to 1..9, the
    tenth position to 0).

    Parameters
    ----------
    image : array
        Image containing the filled-in widget.
    dpi : number
        Resolution of `image`, forwarded to studentnr_data.

    Returns
    -------
    int
        The digits joined into one integer.
    """
    template, centers, r = studentnr_data(dpi)
    height, width = template.shape[:2]
    match = cv2.matchTemplate(image, template, cv2.TM_CCOEFF)
    # minMaxLoc returns (min value, max value, min location, max location);
    # the best match is at the location of the maximum.
    _, _, _, (x0, y0) = cv2.minMaxLoc(match)
    widget = image[y0:y0 + height, x0:x0 + width]

    def darkness(cx, cy):
        # Sum of inverted pixel values in the square around one circle.
        return np.sum(255 - widget[max(cy - r, 0):cy + r,
                                   max(cx - r, 0):cx + r])

    scores = np.array([darkness(cx, cy) for cx, cy in centers])
    digits = (scores.reshape(7, 10).argmax(axis=1) + 1) % 10
    return int(''.join(str(digit) for digit in digits))