# HTML code generation from screenshots

This code is forked from [Emil Wallner's code](https://github.com/emilwallner/Screenshot-to-code-in-Keras/blob/master/local/HTML/HTML.ipynb) on GitHub for HTML code generation, and upgraded with:
- Classical computer vision approach to detect the different elements on the screenshots
- Some hyper-parameter tuning
- A few printing functions to understand the inputs and outputs of each part of the model.

## Init of Google Colab environment
This project has been run & trained on [Google Colab](https://colab.research.google.com/notebooks/welcome.ipynb), allowing easy & free iterations on a Tesla K80. The next few lines install the necessary libraries and mount a Google Drive folder to enable this notebook to access other files (PNG screenshots and HTML files in our case).

In [0]:
# Install the tooling needed to add a PPA, then google-drive-ocamlfuse and fuse
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
# Authenticate the Colab user and fetch the application default credentials
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# First pass: run google-drive-ocamlfuse headless and only keep the output
# line(s) containing "URL"
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
# Read the verification code with getpass, then pipe it into a second pass
# NOTE(review): the client secret and the verification code are interpolated
# into shell command lines - confirm this is acceptable in your environment
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# Create the local "drive" folder and attach google-drive-ocamlfuse to it
!mkdir -p drive
!google-drive-ocamlfuse drive


## Installing libraries and changing to the right directory in Google Drive

In [0]:
# System libraries installed before swapping the OpenCV build
!apt-get install -y libsm6 libxext6
# Replace opencv-python with opencv-contrib-python
# (presumably required for cv2.xfeatures2d, used by the SIFT cell - TODO confirm)
!pip uninstall -y opencv-python
!pip install -U opencv-contrib-python
import os
# Every relative path used afterwards (images/, html/, test/) is resolved
# from this folder of the mounted drive
os.chdir("drive/ECP/DeepLearning/HTML")

## Imports

In [0]:
import cv2
from os import listdir
from numpy import array
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, load_model
from keras.utils import to_categorical
from keras.layers import Embedding, TimeDistributed, RepeatVector, LSTM, CuDNNLSTM, concatenate , Input, Reshape, Dense, Flatten
from keras.preprocessing.image import array_to_img, img_to_array, load_img
from keras.applications.inception_resnet_v2 import InceptionResNetV2, preprocess_input
from keras.callbacks import Callback
from IPython.display import clear_output
from nltk.translate.bleu_score import corpus_bleu
import numpy as np
from matplotlib import pyplot as plt
import copy
!ls

## Detecting features using InceptionResNet (trained on Imagenet)

In [0]:
# Read every screenshot of images/ in filename order, resized to the 299x299
# input used here for inception-resnet, and stack them into one float array
all_filenames = sorted(listdir('images/'))
print(all_filenames)
images = np.array(
    [
        img_to_array(load_img('images/' + name, target_size=(299, 299)))
        for name in all_filenames
    ],
    dtype=float,
)

In [0]:
# Apply the inception-resnet input preprocessing to the whole batch of screenshots
images = preprocess_input(images)
# Run the images through inception-resnet and extract the features without the classification layer
# (weights='imagenet' loads the ImageNet-pretrained weights)
IR2 = InceptionResNetV2(weights='imagenet', include_top=False)
# One feature map per screenshot, in the same order as all_filenames
features = IR2.predict(images)

# print(features.shape)
# plt.imshow(features[3][:,:,2])

In [0]:
# Load two reference screenshots, preprocess them for IR2 and extract their
# features. test_features / test_features2 are read by the training callback
# to compute BLEU scores while the model trains.
IMG_SIZE = 299
test_image = img_to_array(load_img('test/865.png', target_size=(IMG_SIZE, IMG_SIZE)))
test_image = np.array(test_image, dtype=float)
test_image = preprocess_input(test_image)
test_features = IR2.predict(np.array([test_image]))

test_image2 = img_to_array(load_img('test/86.png', target_size=(IMG_SIZE, IMG_SIZE)))
# Convert the second screenshot itself. Converting `test_image` here would
# silently replace it with the already-preprocessed first screenshot, and
# test_features2 would then describe test/865.png preprocessed twice.
test_image2 = np.array(test_image2, dtype=float)
test_image2 = preprocess_input(test_image2)
test_features2 = IR2.predict(np.array([test_image2]))

## Computer vision approach
Here we try different methods to process the images and detect elements of the screenshots, instead of using the heavy InceptionResNet

### Loading images

In [0]:
# Reload the raw screenshots with OpenCV, in filename order, for the classical
# computer vision pipeline
IMG_SIZE = 128
all_filenames = sorted(listdir('images/'))
# Here images are of size 512 x 512
# possible flag : cv2.IMREAD_GRAYSCALE to load the image in grayscale
images = [cv2.imread('images/' + name) for name in all_filenames]

### Keypoints detection using SIFT
Be careful: executing this code will cause an error in the Keras model, since the deep learning network should then take the descriptors as input and not the feature maps.
This is left here only for reference.

In [0]:
# Compute the SIFT keypoints and descriptors of every screenshot, padding with
# zero rows or truncating so each image ends up with exactly NB_DESCRIPTORS
# descriptors of 128 values
NB_DESCRIPTORS = 90
descriptors = []
kps = []
# Same parameters for every image, so the detector is built once
sift = cv2.xfeatures2d.SIFT_create(contrastThreshold=0.01, edgeThreshold=40)
for img in images:
    kp, des = sift.detectAndCompute(img, None)
    if des is None:
        # No keypoint detected: OpenCV hands back None instead of an empty
        # array, which would crash on `des.shape` below
        des = np.zeros((0, 128), dtype=np.float32)
    print(des.shape)
    missing = NB_DESCRIPTORS - des.shape[0]
    if missing > 0:
        des = np.concatenate([des, np.zeros((missing, 128))], axis=0)
    else:
        des = des[:NB_DESCRIPTORS]

    descriptors.append(des)
    kps.append(kp)

dummy = np.zeros((1,1))
descriptors = np.array(descriptors)
print(descriptors.shape)

# using a gray scale to output the tones of the elements in the screenshot
# 26 consecutive (lower, upper) intensity bands of width 10: (0, 10), (10, 20), ..., (250, 260)
boundaries = [(i, i + 10) for i in range(0, 255, 10)]


# For each screenshot, collect one Harris corner response followed by one
# masked copy of the image per intensity band
features = []
for img in images:
  im_features = []
  # NOTE(review): the images were loaded without IMREAD_GRAYSCALE, while
  # cornerHarris is documented for single-channel input - confirm this call
  # runs on these images
  kp = cv2.cornerHarris(img,2,3,0.04)
  im_features.append(kp)
  
  for (lower, upper) in boundaries:
    # mask selects the pixels inside the band; bitwise_and keeps only those pixels of img
    # NOTE(review): scalar bounds on a presumably 3-channel image - verify the intended channel handling
    mask = cv2.inRange(img, lower, upper)
    output = cv2.bitwise_and(img, img, mask = mask)
    im_features.append(output)
  
  features.append(im_features)

# NOTE(review): overwrites the `features` variable computed with InceptionResNet
features = np.array(features)
print(features.shape)

### Features detection using contours & downsampling

In [0]:
# partially inspired from https://docs.opencv.org/3.1.0/d3/db4/tutorial_py_watershed.html
def process_screenshot(img, low_thresh, high_thresh):
    """Detect the elements of a screenshot and downsample each one to a small feature map.

    The screenshot is converted to grayscale and only the pixels whose gray
    level lies in ``(low_thresh, high_thresh]`` are kept. A distance transform
    followed by a threshold at 30% of its maximum isolates the core of each
    element, and the external contours of that mask give one bounding box per
    element. Boxes narrower or shorter than 3 pixels are discarded as noise.

    Args:
        img: BGR colour image as returned by ``cv2.imread`` (it is passed to
            ``cv2.COLOR_BGR2GRAY``). The notebook uses 512 x 512 screenshots.
        low_thresh: gray levels lower than or equal to this value are dropped.
        high_thresh: gray levels strictly greater than this value are dropped.

    Returns:
        A tuple ``(detection_imgs, rois, windows, features)``:

        * ``detection_imgs``: one copy of ``img`` per element, with a magenta
          rectangle drawn around that element (contour order).
        * ``rois``: the interior of each rectangle, cropped from the annotated
          copy (contour order).
        * ``windows``: one black image of the same shape as ``img`` per
          element, containing only that element at its original position,
          sorted by increasing bounding-box area.
        * ``features``: each window repeatedly halved with ``cv2.pyrDown``
          while its height stays >= 16, i.e. 8 x 8 x 3 for a 512 x 512 input
          (same order as ``windows``).

        Because only ``windows`` and ``features`` are sorted, their indices do
        not match those of ``detection_imgs`` and ``rois``.
    """
    # converting the image to grayscale, applying threshold to detect all the
    # elements of the screenshot (text, frames, links ...)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, too_dark = cv2.threshold(gray, low_thresh, 255, cv2.THRESH_BINARY_INV)
    _, too_bright = cv2.threshold(gray, high_thresh, 255, cv2.THRESH_BINARY)
    thresholded = cv2.bitwise_not(cv2.add(too_dark, too_bright))

    # Finding sure foreground area by reducing the "interfaces" between
    # different elements
    dist_transform = cv2.distanceTransform(thresholded, cv2.DIST_L2, 5)
    _, sure_fg = cv2.threshold(
        dist_transform, 0.3 * dist_transform.max(), 255, cv2.THRESH_BINARY
    )
    sure_fg = np.uint8(sure_fg)

    # NOTE : a watershed refinement (sure background / unknown region /
    # markers) was tried and dropped because it was not as efficient as
    # expected; the contours of the sure foreground gave the best results.

    # findContours returns (image, contours, hierarchy) on OpenCV 3.x but
    # (contours, hierarchy) on 2.x / 4.x: the contours are always second to last
    contours = cv2.findContours(
        sure_fg, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE
    )[-2]

    detection_imgs = []
    rois = []  # regions of interest
    windows = []
    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)

        # discard areas that are too small (probably noise)
        if h < 3 or w < 3:
            continue

        # drawing a magenta rectangle around the contour on a fresh copy of
        # the original image, so that each copy shows a single element
        annotated = img.copy()
        annotated = cv2.rectangle(
            annotated, (x, y), (x + w, y + h), (255, 0, 255), 1
        )
        # interior of the rectangle, excluding the drawn border
        inner = (slice(y + 1, y + h), slice(x + 1, x + w))

        # drawing only the detected element of the screenshot on a black
        # background of the same size as the input
        window = np.zeros(img.shape)
        window[inner] = annotated[inner]

        detection_imgs.append(annotated)
        rois.append(annotated[inner])
        windows.append((window, w * h))

    # Windows are ordered by increasing bounding-box area.
    # NOTE(review): the original comment said this puts "the most important
    # one first", yet an ascending sort puts the smallest element first -
    # confirm which order is intended before changing it.
    windows.sort(key=lambda tup: tup[1])
    windows = [tup[0] for tup in windows]

    # Here we downscale our images and keep the important features
    # (localization of the element in the screenshot and texture) by using a
    # Gaussian convolution
    # See https://docs.opencv.org/3.4.0/d4/d1f/tutorial_pyramids.html for more infos
    features = []
    for window in windows:
        feature = window
        while feature.shape[0] / 2 >= 8:
            feature = cv2.pyrDown(feature)
        features.append(feature)

    return detection_imgs, rois, windows, features


# Run the detection on one screenshot and display, for the entry at position
# INDEX_EL of each returned list: the ROI, the window, the downsampled feature
# map and the annotated screenshot
detection_imgs, rois, windows, features = process_screenshot(images[3], 51, 253)
INDEX_EL = 25
fig = plt.figure()
panels = (rois, windows, features, detection_imgs)
for position, collection in enumerate(panels, start=1):
    fig.add_subplot(2, 2, position)
    plt.imshow(collection[INDEX_EL])
plt.show()

# Build the image features of the first four screenshots, each one processed
# with its own (low, high) gray-level thresholds
LOW_THRESH = 51
HIGH_THRESH = 253
NB_ELEMENTS_WANTED = 100
features = []
feats1, feats2, feats3, feats4 = (
    np.array(process_screenshot(images[idx], low, high)[3])
    for idx, (low, high) in enumerate(
        [(54, 221), (53, 253), (53, 253), (51, 253)]
    )
)
for feats in (feats1, feats2, feats3, feats4):
    print(feats.shape)
    # only the first 30 feature maps of each screenshot are kept
    features.append(feats[:30])

# Move the element axis last and merge it with the colour channels:
# (image, element, row, col, channel) -> (image, row, col, channel * element)
features = (
    np.array(features)
    .transpose((0, 2, 3, 4, 1))
    .reshape(4, 8, 8, -1)
)
features.shape


## Tokenizing text & preparing image features

In [0]:
# We will cap each input sequence to 100 tokens
max_caption_len = 100
# Initialize the function that will create our vocabulary
# (no character filtering, tokens split on single spaces, case preserved)
tokenizer = Tokenizer(filters='', split=" ", lower=False)

# Read a document and return a string
def load_doc(filename):
    """Return the whole content of the text file at ``filename``.

    Args:
        filename: path of the file to read, opened in text mode.

    Returns:
        The content of the file as a single string.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    # The context manager closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()

# Load all the HTML files, in filename order so that document number k lines
# up with features[k]
all_filenames = sorted(listdir('html/'))
print(all_filenames)
X = [load_doc('html/' + filename) for filename in all_filenames]

# Create the vocabulary from the html files
tokenizer.fit_on_texts(X)

# Add +1 to leave space for empty words
vocab_size = len(tokenizer.word_index) + 1
# Translate each word in text file to the matching vocabulary index
sequences = tokenizer.texts_to_sequences(X)
# The longest HTML file
max_length = max(len(s) for s in sequences)
for s in sequences:
    print(len(s))
print(vocab_size)
print(max_length)

# Initialize our final input to the model: one training sample per
# (document, prefix length) pair
X, y, image_data = list(), list(), list()
for img_no, seq in enumerate(sequences):
    for i in range(1, len(seq)):
        # The first i tokens are the input, the following token is the output
        in_seq, out_seq = seq[:i], seq[i]
        # Left-pad with empty words, or keep only the last tokens, so that the
        # input is always exactly max_caption_len long. Padding to max_length
        # and slicing afterwards gave rows shorter than max_caption_len
        # whenever the longest document had fewer than max_caption_len tokens.
        in_seq = pad_sequences([in_seq], maxlen=max_caption_len)[0]
        # Map the output to one-hot encoding
        out_seq = to_categorical([out_seq], num_classes=vocab_size)[0]
        # Add the image features corresponding to the HTML file
        image_data.append(features[img_no])
        X.append(in_seq)
        y.append(out_seq)

X, y, image_data = np.array(X), np.array(y), np.array(image_data)



## Model definition

In [0]:
# Create the encoder
# Image branch: flatten the 8 x 8 x 1536 feature map, project it to 128 units
# and repeat it once per caption position so it can be concatenated with the
# per-token language features
# NOTE(review): 1536 channels presumably matches the InceptionResNetV2
# features; the computer vision cell reshapes `features` to (4, 8, 8, -1) from
# 30 elements x 3 channels - confirm which features are fed at training time
image_features = Input(shape=(8, 8, 1536,))
image_flat = Flatten()(image_features)
image_flat = Dense(128, activation='relu')(image_flat)
ir2_out = RepeatVector(max_caption_len)(image_flat)

# Language branch: 200-dimensional embedding of the token ids, two stacked
# CuDNNLSTM layers returning the full sequence, then a Dense layer applied to
# every time step
language_input = Input(shape=(max_caption_len,))
language_model = Embedding(vocab_size, 200, input_length=max_caption_len)(language_input)
language_model = CuDNNLSTM(256, return_sequences=True)(language_model)
language_model = CuDNNLSTM(256, return_sequences=True)(language_model)
language_model = TimeDistributed(Dense(128, activation='relu', name='context_features'))(language_model)

# Create the decoder
# Image and language features are concatenated per time step; the last LSTM
# returns only its final output, turned into a distribution over the vocabulary
decoder = concatenate([ir2_out, language_model])
decoder = CuDNNLSTM(512, return_sequences=False)(decoder)
decoder_output = Dense(vocab_size, activation='softmax')(decoder)

# Compile the model
# Inputs are [image features, token ids]; the target is the one-hot next token
model = Model(inputs=[image_features, language_input], outputs=decoder_output)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

## Utility functions

In [0]:
# map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the first word of ``tokenizer.word_index`` mapped to ``integer``.

    Args:
        integer: vocabulary index to look up.
        tokenizer: object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    matches = (
        word
        for word, index in tokenizer.word_index.items()
        if index == integer
    )
    return next(matches, None)

In [0]:
# generate a description for an image
def generate_desc(model, tokenizer, photo, max_length, verbose=1):
    """Greedily generate the markup of a screenshot, one token at a time.

    Starting from the seed token ``START``, the model is repeatedly asked for
    the most likely next token given the image features and the tokens
    produced so far. Generation stops after 900 tokens, when ``END`` is
    produced, or when the predicted index has no word in the vocabulary.

    Args:
        model: model whose ``predict`` takes ``[photo, sequence]`` and returns
            scores over the vocabulary.
        tokenizer: fitted tokenizer used to encode the text generated so far
            and to map the predicted index back to a word.
        photo: image features, already batched for ``model.predict``.
        max_length: number of tokens fed to the model at each step; it must
            match the length of the model's language input.
        verbose: when > 0, print each generated token as it is produced.

    Returns:
        The generated text, starting with ``START``, tokens separated by spaces.
    """
    # seed the generation process
    in_text = 'START'
    # iterate over the whole length of the sequence
    for _ in range(900):
        # integer encode the text so far and keep the most recent max_length
        # tokens (a hard-coded 100 was used before, ignoring the parameter)
        sequence = tokenizer.texts_to_sequences([in_text])[0][-max_length:]
        # pad input
        sequence = pad_sequences([sequence], maxlen=max_length)
        # predict next word
        yhat = model.predict([photo, sequence], verbose=0)
        # convert probability to integer
        yhat = np.argmax(yhat)
        # map integer to word
        word = word_for_id(yhat, tokenizer)
        # stop if we cannot map the word
        if word is None:
            print("NONE")
            break
        # append as input for generating the next word
        in_text += ' ' + word
        # Print the prediction
        if verbose > 0:
            print(' ' + word, end='')
        # stop if we predict the end of the sequence
        if word == 'END':
            break
    return in_text

In [0]:
# loads the html file, normalizes its spacing and returns it in a one-element list
def load_text(filepath):
    """Load an HTML file as a single normalized, tag-wrapped string.

    The content is wrapped between ``<START>`` and ``<END>``, every run of
    whitespace is collapsed to one space, and a space is inserted before each
    comma.

    NOTE(review): generation is seeded with ``START`` and stops on ``END``
    (no angle brackets) - confirm the wrapping tokens used here are the
    intended ones.

    Args:
        filepath: path of the HTML file to load.

    Returns:
        A list containing the single processed string.
    """
    # Wrap the document in a start and an end tag
    wrapped = '<START> ' + load_doc(filepath) + ' <END>'
    # Separate each word with a single space (like the tokenizer)
    normalized = ' '.join(wrapped.split())
    # Put a space in front of every comma
    return [normalized.replace(',', ' ,')]

# Evaluate model
def evaluate_model(model, texts, photo, tokenizer, max_length, verbose=1):
    """Generate markup for ``photo`` once per reference text and score it with BLEU.

    Args:
        model: trained model passed to ``generate_desc``.
        texts: reference strings; one generation is run for each of them.
        photo: image features passed to ``generate_desc``.
        tokenizer: fitted tokenizer passed to ``generate_desc``.
        max_length: sequence length passed to ``generate_desc``.
        verbose: when > 0, also print each reference text after its generation.

    Returns:
        A tuple ``(bleu, actual, predicted)`` where ``actual`` holds, per
        reference, a one-element list with its tokens, and ``predicted`` holds
        the tokens of each generated text.
    """
    actual, predicted = [], []
    # one generation per reference text
    for reference in texts:
        generated = generate_desc(model, tokenizer, photo, max_length, verbose)
        if verbose > 0:
            print('\n\nReal---->\n\n' + reference)
        # corpus_bleu expects a list of references for every hypothesis
        actual.append([reference.split()])
        predicted.append(generated.split())
    # calculate BLEU score
    bleu = corpus_bleu(actual, predicted)
    return bleu, actual, predicted


In [0]:
# Number of epochs between two BLEU evaluations
BLEU_STEP = 25


class PlotLossesBinary(Callback):
    """Redraw the training loss and BLEU curves at the end of every epoch.

    This is very useful to know when you are overfitting. The loss is recorded
    at each epoch; every ``BLEU_STEP`` epochs the model also generates the
    markup of two reference screenshots (using the module-level
    ``test_features`` / ``test_features2``, ``tokenizer`` and
    ``max_caption_len``) and records their BLEU scores.
    """

    def on_train_begin(self, logs=None):
        """Reset every recorded series at the start of training."""
        self.i = 0
        self.x = []
        self.losses = []
        self.bleu_train = []
        self.bleu_test = []

        self.index = 0
        self.max = 0

        self.logs = []

    def _bleu_for(self, html_path, photo_features):
        """Return the BLEU score of the generated markup against ``html_path``."""
        texts = load_text(html_path)
        bleu, _, _ = evaluate_model(
            self.model, texts, np.array(photo_features), tokenizer,
            max_caption_len, verbose=0)
        return bleu

    def on_epoch_end(self, epoch, logs=None):
        """Record the epoch loss, refresh BLEU when due, and redraw the plots."""
        # A None default avoids sharing one mutable dict between calls
        logs = {} if logs is None else logs

        self.logs.append(logs)
        self.x.append(self.i)
        self.losses.append(logs.get('loss'))
        if self.i % BLEU_STEP == 0:
            self.bleu_test.append(self._bleu_for('test/865.html', test_features))

            train_bleu = self._bleu_for('test/86.html', test_features2)
            self.bleu_train.append(train_bleu)
            print(train_bleu)

        clear_output(wait=True)

        plt.subplot(221)
        plt.plot(self.x, self.losses, label="Training loss")
        plt.legend()

        # BLEU is only available for the epochs on which it was evaluated
        bleu_epochs = [el for el in self.x if el % BLEU_STEP == 0]
        plt.subplot(222)
        plt.plot(bleu_epochs, self.bleu_train, label="BLEU train score")
        plt.plot(bleu_epochs, self.bleu_test, label="BLEU test score")
        plt.legend()

        plt.subplots_adjust(top=0.92, bottom=0.08, left=0.10, right=0.95,
                            hspace=0.25, wspace=0.35)
        plt.show()
        self.i += 1


plot_losses = PlotLossesBinary()

## Training the model

In [0]:
# Train the neural network
# model = load_model('model.h5')
# Samples are fed in the order they were built (shuffle=False), in batches of
# 64 for 50 epochs; plot_losses redraws the loss / BLEU curves after each epoch
model.fit([image_data, X], y, batch_size=64, shuffle=False, epochs=50, callbacks=[plot_losses], verbose=2)

In [0]:
# Save the trained model to model.h5 in the current working directory
model.save('model.h5')

## Generating the HTML code from the trained model & the test image

In [0]:
# Generate the HTML of the test screenshot from its computer vision features
# possible flag : cv2.IMREAD_GRAYSCALE to load the image in grayscale
test_image = cv2.imread('test/865.png')
_, _, _, test_features = process_screenshot(test_image, 75, 253)
# Keep the first 30 feature maps, move the element axis last and merge it
# with the colour channels
test_features = (
    np.array(test_features[:30])
    .transpose((1, 2, 3, 0))
    .reshape(8, 8, -1)
)
print(test_features.shape)
generate_desc(model, tokenizer, np.array([test_features]), max_caption_len)

In [0]:
# Load an image, preprocess it for IR2, extract its features and generate the HTML
test_image = img_to_array(load_img('test/865.png', target_size=(299, 299)))
test_image = preprocess_input(np.array(test_image, dtype=float))
# IR2 expects a batch, hence the extra leading axis
test_features = IR2.predict(np.array([test_image]))
generate_desc(model, tokenizer, np.array(test_features), 100)

In [0]:
# Score the markup generated for the test screenshot against its reference
# HTML with BLEU (uses whichever test_features was computed last)
texts = load_text('test/865.html')
bleu, actual, predicted = evaluate_model(model, texts, np.array(test_features), tokenizer, max_caption_len)
print(bleu)


In [0]:
# Bonus : here we can generate the HTML markup features to see what they look like, by getting the intermediate model
def generate_language_features(model, tokenizer, max_length):
    """Print the features the language branch produces for the seed token.

    Args:
        model: model taking a padded token sequence as its only input.
        tokenizer: fitted tokenizer used to encode the seed text.
        max_length: length the token sequence is truncated / padded to.

    Returns:
        None. The features are printed 900 times.
    """
    # seed the generation process
    in_text = 'START'
    # The seed text is never extended, so the input is the same on every
    # pass: encode, pad and predict once instead of once per iteration.
    # The window uses the max_length argument instead of reaching for the
    # global max_caption_len.
    sequence = tokenizer.texts_to_sequences([in_text])[0][-max_length:]
    sequence = pad_sequences([sequence], maxlen=max_length)
    context_features = model.predict([sequence], verbose=0)
    # NOTE(review): the printed output is kept as it was (900 identical
    # prints) - confirm whether a single print, or feeding generated tokens
    # back into in_text, was intended.
    for _ in range(900):
        print("features", context_features)

# Sub-model going from the token input to the output of the language branch
# (the TimeDistributed Dense layer), used to inspect the markup features
intermediate_model = Model(inputs=language_input, outputs=language_model)
generate_language_features(intermediate_model, tokenizer, 100)
