    To do:
    - Use pickle to store the training data.
    - Pass text blocks to Tesseract.
    - Pass photographs to NeuralTalk2.

# Generate GeM annotation

## 1. Import the necessary packages.

In [None]:
# Must precede every other statement: Python rejects __future__ imports
# that appear after other code.
from __future__ import print_function

# For handling file paths
import os

# For computer vision
import cv2
import mahotas
from imutils import paths

# For machine learning
from sklearn.cross_validation import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

# For logging
import logging
from logging import FileHandler
from vlogging import VisualRecord

# For connected-component analysis
from skimage.filters import threshold_adaptive
from skimage import measure

# For numerical arrays
import numpy as np

# For encoding files
import codecs

Open the logging file and set the attributes.

In [None]:
# Send every record of level DEBUG and above to an HTML file that is
# truncated each time the handler is created (mode "w").
fh = FileHandler("detect_and_classify_elements_log.html", mode="w")

logger = logging.getLogger("detect_elements")
logger.addHandler(fh)
logger.setLevel(logging.DEBUG)

# Prevent logger output in IPython
logger.propagate = False

# Define a function to handle visual logging
def vlog(image, title):
    """Log *image* at DEBUG level as a PNG visual record labelled *title*."""
    record = VisualRecord(title, image, fmt="png")
    logger.debug(record)

## 2. Define functions

In [None]:
def describe(image):
    """Return a 1-D colour-and-texture feature vector for a BGR image.

    The vector holds the per-channel means of the HSV version of the image,
    then the per-channel standard deviations, then the Haralick texture
    features of the grayscale version averaged over axis 0.
    """
    hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    channel_means, channel_stds = cv2.meanStdDev(hsv)
    color_stats = np.concatenate([channel_means, channel_stds]).flatten()

    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    texture = mahotas.features.haralick(grayscale).mean(axis=0)

    return np.hstack([color_stats, texture])

## 3. Train the classifier

Describe the images.

In [None]:
# Build one feature vector and one class label per training image. The label
# is the part of the file name that precedes the first underscore.
imagePaths = sorted(paths.list_images('training_data/'))
labels = []
data = []

for imagePath in imagePaths:
    image = cv2.imread(imagePath)

    # cv2.imread returns None instead of raising when a file cannot be read;
    # skip such files so that describe() is never handed None.
    if image is None:
        logger.warning("Skipping unreadable training image: {}".format(imagePath))
        continue

    # os.path.basename honours the platform's path separators, unlike a
    # manual search for '/'.
    label = os.path.basename(imagePath).split('_')[0]

    features = describe(image)
    labels.append(label)
    data.append(features)

Split the data into training and testing data.

In [None]:
# Hold out a quarter of the samples for testing; the fixed seed makes the
# split repeatable.
(trainData, testData, trainLabels, testLabels) = train_test_split(
    np.array(data),
    np.array(labels),
    test_size=0.25,
    random_state=42
)

Set up a random forest classifier.

In [None]:
# A forest of 20 trees, seeded so that repeated runs build the same model.
model = RandomForestClassifier(random_state=42, n_estimators=20)

Train the classifier.

In [None]:
# Fit the forest on the training portion of the split.
model.fit(X=trainData, y=trainLabels)

Print out a report on accuracy.

In [None]:
# Score the classifier on the held-out samples and print the per-class report.
predictions = model.predict(testData)
report = classification_report(testLabels, predictions)
print(report)

## 4. Prepare the document image.

Load the image.

In [None]:
image_path = 'test_images/2005-hwy-side_b-5.jpg'
image = cv2.imread(image_path)

# cv2.imread signals failure by returning None rather than raising; fail
# here with a clear message instead of an AttributeError on image.shape.
if image is None:
    raise IOError("Could not read image: {}".format(image_path))

# Logging
logger.debug("Image width: {}, height: {}".format(image.shape[1], image.shape[0]))
vlog(image, "Original image")

Convert image to grayscale.

In [None]:
# Reduce the BGR image to a single channel; the filtering and thresholding
# steps below operate on this grayscale version.
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

# Logging
vlog(gray, "Grayscale")

Apply bilateral filtering to remove detail but preserve edges.

In [None]:
# (neighbourhood diameter, standard deviation for color, standard deviation
# for space)
params = (11, 41, 21)
(diameter, sigma_color, sigma_space) = params
blurred = cv2.bilateralFilter(gray, diameter, sigma_color, sigma_space)

# Logging
logger.debug("Parameters for bilateral filtering: diameter of the pixel neighbourhood: {}, standard deviation for color: {}, standard deviation for space: {}".format(diameter, sigma_color, sigma_space))
vlog(blurred, "Bilaterally filtered")

Define a kernel size for morphological operations.

> The kernel size must be chosen after the input image resolution has been decided. It should be based on the type size and correspond roughly to the x-height of the font face used for body text.

In [None]:
# Size of the rectangular structuring element built from this value for the
# morphological gradient.
kernelsize = (11, 13)

Perform Otsu's thresholding.

In [None]:
# With THRESH_OTSU the threshold argument (0) is ignored and the value is
# picked automatically; T receives the value that was chosen.
# THRESH_BINARY is the default type and is only spelled out for clarity.
otsu_flags = cv2.THRESH_BINARY | cv2.THRESH_OTSU
(T, thresholded) = cv2.threshold(blurred, 0, 255, otsu_flags)

# Logging
logger.debug("Otsu's threshold: {}".format(T))
vlog(thresholded, "Thresholded")

## 5. Perform morphological operations on the document image.

In [None]:
# Rectangular structuring element of the configured size, used by the
# morphological gradient below.
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, kernelsize)

#### Morphological gradient 

In [None]:
# Apply the morphological gradient to a copy of the thresholded image so the
# thresholded image itself stays untouched.
gradient_input = thresholded.copy()
gradient = cv2.morphologyEx(gradient_input, cv2.MORPH_GRADIENT, kernel)

# Logging
logger.debug("Kernel size: {}".format(kernelsize))
vlog(gradient, "Morphological gradient applied")

#### Erode

In [None]:
# Two erosion passes; passing None as the kernel leaves the choice of
# structuring element to OpenCV's default.
erosion_passes = 2
eroded = cv2.erode(gradient, None, iterations=erosion_passes)

# Logging
vlog(eroded, "Morphological gradient eroded")

## 6. Perform connected-components labeling

Perform connected component labeling and set up a mask for the labels to be kept.

In [None]:
# Label the 8-connected components of the eroded image, treating pixels with
# value 0 as background. For a 2-D image connectivity=2 is the same
# neighbourhood as the deprecated neighbors=8 argument, which later
# scikit-image releases no longer accept.
# Note: this rebinds `labels`, which held the training labels until now.
labels = measure.label(eroded, connectivity=2, background=0)

# Empty mask that will receive the components worth keeping.
gradient_mask = np.zeros(gradient.shape, dtype="uint8")

Loop over the labels twice:
    1. Calculate the average number of pixels per label.
    2. Decide which labels to include in the mask.

In [None]:
# First loop: collect the pixel count of every foreground component.

# Pixels that are zero in the eroded image carry the background label. Older
# scikit-image releases use -1 for it and newer ones use 0, so read it from
# the data instead of hard-coding either value.
background_labels = np.unique(labels[eroded == 0])

numpixels_all = []

for label in np.unique(labels):
    if label in background_labels:
        continue
    # Counting the matching pixels directly avoids building a full-size
    # mask per component.
    numpixels = int(np.count_nonzero(labels == label))
    numpixels_all.append(numpixels)

# An image without any foreground component would otherwise divide by zero.
average = sum(numpixels_all) / len(numpixels_all) if numpixels_all else 0

In [None]:
# Second loop: keep every component larger than 5 % of the average size.

# Pixels that are zero in the eroded image carry the background label. Older
# scikit-image releases use -1 for it and newer ones use 0, so read it from
# the data instead of hard-coding either value.
background_labels = np.unique(labels[eroded == 0])

# The size threshold is the same for every component, so compute it once.
min_pixels = int(average) * 0.05

for label in np.unique(labels):
    if label in background_labels:
        continue
    labelmask = np.zeros(gradient.shape, dtype="uint8")
    labelmask[labels == label] = 255
    numpixels = cv2.countNonZero(labelmask)

    if numpixels > min_pixels:
        gradient_mask = cv2.add(gradient_mask, labelmask)

# Logging
logger.debug("Average size for label: {}".format(average))
vlog(gradient_mask, "Mask for morphological gradient after connected-components labeling")

## 7. Find and classify contours in the processed image

Find contours in the image after applying morphological gradient and performing connected-components labeling.

In [None]:
# cv2.findContours returns (contours, hierarchy) in OpenCV 2.4 and 4.x but
# (image, contours, hierarchy) in 3.x. The last two items are the same in
# every version, so slice them off instead of unpacking the whole result.
# The mask is copied because older OpenCV versions modify the input image.
(contours, hierarchy) = cv2.findContours(gradient_mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2:]

Set up another mask.

In [None]:
# Blank 8-bit canvas with the same dimensions as the gradient mask.
contour_mask = np.zeros(shape=gradient_mask.shape, dtype=np.uint8)

Draw contours on the mask.

In [None]:
# Paint the bounding box of every contour as a filled white rectangle
# (thickness -1 fills the shape).
white = (255, 255, 255)

for c in contours:
    (x, y, w, h) = cv2.boundingRect(c)
    top_left = (x, y)
    bottom_right = (x + w, y + h)
    cv2.rectangle(contour_mask, top_left, bottom_right, white, -1)

# Logging
vlog(contour_mask, "Contour mask")

Detect and classify contours in the mask and draw them on the original image.

In [None]:
# cv2.findContours returns (contours, hierarchy) in OpenCV 2.4 and 4.x but
# (image, contours, hierarchy) in 3.x. The last two items are the same in
# every version, so slice them off instead of unpacking the whole result.
(maskcontours, maskhierarchy) = cv2.findContours(contour_mask.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[-2:]

Open the XML file.

In [None]:
# XML declaration written at the top of the annotation file.
preamble = '<?xml version="1.0" encoding="UTF-8"?>\n\n'

# Output file for the annotation, written as UTF-8.
xmlfile = codecs.open('layout-1.xml', mode='w', encoding='utf-8')

Write preamble.

In [None]:
# The XML declaration has to be the first thing in the file.
xmlfile.write(preamble)

In [None]:
# Classify every region found in the contour mask, outline it on a copy of
# the page and collect the matching annotation lines.
original = image.copy()

oh = original.shape[0]
ow = original.shape[1]

segmentation = []
area_model = []
realization = []

# Outline colours in BGR order, keyed by predicted class: red for text,
# green for photographs. Any other prediction is ignored.
outline_colors = {'text': (0, 0, 255), 'photo': (0, 255, 0)}

for num, mc in enumerate(maskcontours):
    (x, y, w, h) = cv2.boundingRect(mc)

    # Skip regions that span more than 90 % of the page height.
    if h > (0.9 * oh):
        continue

    # Crop from the untouched image rather than from `original`, so that
    # outlines already drawn for earlier regions cannot leak into the
    # features of this one.
    bounding_box = image[y:y + h, x:x + w]
    features = describe(bounding_box)

    # scikit-learn expects a 2-D (n_samples, n_features) array; present the
    # single feature vector as one row.
    prediction = model.predict(features.reshape(1, -1))[0]

    if prediction not in outline_colors:
        continue

    # Draw rectangle on the annotated copy
    cv2.rectangle(original, (x, y), (x + w, y + h), outline_colors[prediction], 1)

    # Identifiers follow the contour index, so skipped regions leave gaps.
    unit = str(num + 1)

    # Describe layout unit and realization; only these differ between classes
    if prediction == 'text':
        layout_unit = '\t\t<layout-unit id="lay-1.{}"/>\n'.format(unit)
        unit_realization = '\t\t<realization xref="lay-1.{}" type="text"/>\n'.format(unit)
    else:
        layout_unit = '\t\t<layout-unit id="lay-1.{}" alt="Photo"/>\n'.format(unit)
        unit_realization = '\t\t<realization xref="lay-1.{}" type="photo" width="{}px" height="{}px"/>\n'.format(unit, w, h)

    # Describe sub-area
    sub_area = '\t\t<sub-area id="sa-1.{}" startx="{}" starty="{}" endx="{}" endy="{}"/>\n'.format(unit, x, y, x + w, y + h)

    # Append descriptions to list
    segmentation.append(layout_unit)
    area_model.append(sub_area)
    realization.append(unit_realization)

vlog(original, "RESULT 1: Contours detected in the contour mask")

## 8. Generate the GeM XML file

Generate annotation for layout layer segmentation.

In [None]:
# Write the <segmentation> element: opening tag, one layout unit per line,
# closing tag.
segmentation_preamble = '\t<segmentation>\n'
segmentation_terminate = '\t</segmentation>\n'

xmlfile.write(segmentation_preamble)
xmlfile.writelines(segmentation)
xmlfile.write(segmentation_terminate)

Generate annotation for area model.

In [None]:
# Write the <area-model> element: opening tag, one sub-area per line,
# closing tag.
areamodel_preamble = '\t<area-model>\n'
areamodel_terminate = '\t</area-model>\n'

xmlfile.write(areamodel_preamble)
xmlfile.writelines(area_model)
xmlfile.write(areamodel_terminate)

Generate annotation for realization information.

In [None]:
# Write the <realization> element: opening tag, one realization entry per
# line, closing tag.
realization_preamble = '\t<realization>\n'
realization_terminate = '\t</realization>\n'

xmlfile.write(realization_preamble)
xmlfile.writelines(realization)
xmlfile.write(realization_terminate)

In [None]:
# Close the file so that everything written so far is flushed to disk.
xmlfile.close()