### Util

#### Hot encode and decode

In [149]:
import numpy as np
from collections import OrderedDict
from operator import itemgetter


def myHotEncode(input_data, max_vocab=0, vocab2idx=None):
    """Encode every sample as a token-count ("multi-hot") vector.

    Args:
        input_data: Sequence of samples; each sample is an iterable of
            hashable tokens. It is traversed twice when the vocabulary has
            to be built, so it must be re-iterable in that case.
        max_vocab: When greater than 0 and the vocabulary is built here,
            only the ``max_vocab`` most frequent tokens are kept. Ignored
            when ``vocab2idx`` is supplied.
        vocab2idx: Optional existing ``token -> column index`` mapping. When
            ``None`` a new mapping is built from ``input_data``, ordered by
            decreasing frequency (ties keep first-seen order).

    Returns:
        A ``(data_ret, vocab2idx)`` tuple. ``data_ret`` is a list with one
        float32 vector of length ``len(vocab2idx)`` per sample; position
        ``vocab2idx[token]`` holds how many times the token occurs in the
        sample. Tokens missing from the vocabulary are ignored. For an empty
        ``input_data`` a single all-zero vector is returned instead of an
        empty list (kept for backward compatibility).
    """
    if vocab2idx is None:
        vocab_freq = {}
        for sample in input_data:
            for token in sample:
                vocab_freq[token] = vocab_freq.get(token, 0) + 1
        # The sort is stable, so equally frequent tokens keep first-seen order.
        ranked = sorted(vocab_freq.items(), key=itemgetter(1), reverse=True)
        if max_vocab > 0:
            ranked = ranked[:max_vocab]
        vocab2idx = {token: idx for idx, (token, _) in enumerate(ranked)}

    vocab_size = len(vocab2idx)
    data_ret = []
    for sample in input_data:
        # Start from an explicit zero vector so that a sample without any
        # known token still yields a vector (summing an empty list of
        # one-hot rows would give the plain integer 0).
        vector = np.zeros(vocab_size, dtype='float32')
        for token in sample:
            idx = vocab2idx.get(token)
            if idx is not None:
                vector[idx] += 1
        data_ret.append(vector)

    if not data_ret:
        # Same dtype as the per-sample vectors above.
        data_ret = np.zeros(vocab_size, dtype='float32')
    return data_ret, vocab2idx


def myHotDecode(input_data, vocab2idx):
    """Turn multi-hot vectors back into tokens and active indexes.

    Args:
        input_data: Iterable of vectors; every vector must have exactly
            ``len(vocab2idx)`` entries.
        vocab2idx: ``token -> column index`` mapping used for the encoding.

    Returns:
        A ``(data_, data_idx)`` tuple with one list per input vector:
        ``data_`` holds the tokens whose entry is greater than 0 and
        ``data_idx`` the corresponding positions. When a vector's length
        does not match the vocabulary, an error message is printed and
        ``None`` is returned.

    Raises:
        IndexError: If an active position has no token in ``vocab2idx``.
    """
    # Invert the mapping once instead of scanning the vocabulary for every
    # active position. ``setdefault`` keeps the first token when two tokens
    # share an index.
    idx2vocab = {}
    for token, idx in vocab2idx.items():
        idx2vocab.setdefault(idx, token)
    vocab_size = len(vocab2idx)

    data_ = []
    data_idx = []
    for vector in input_data:
        if len(vector) != vocab_size:
            print('Erro:', 'The vocab2idx not fit the input data!')
            return None
        active = [pos for pos, value in enumerate(vector) if value > 0]
        try:
            tokens = [idx2vocab[pos] for pos in active]
        except KeyError as err:
            raise IndexError(
                'no token in vocab2idx has index {}'.format(err.args[0])
            ) from None
        data_.append(tokens)
        data_idx.append(active)
    return data_, data_idx

#### Treat a list of str

In [31]:
import re


def treat_str_list(input_list):
    """Split strings into lower-cased word tokens.

    Every string of ``input_list`` is split on runs of non-word characters
    and underscores; the pieces of all strings are returned as one flat
    list, in order. Empty pieces (produced by an empty string or by leading
    or trailing separators) are dropped so they never become tokens.
    """
    # Raw string: ``\W`` is a regex class, not a string escape.
    splitter = re.compile(r'[\W_]+')
    ret = []
    for s in input_list:
        # Whitespace is itself a separator, so the pieces need no strip().
        ret.extend(piece.lower() for piece in splitter.split(s) if piece)
    return ret

### Load data

In [136]:
import pandas as pd


# Read the annotated features CSV into a DataFrame.
data_path = "features_annotated.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [137]:
import ast


# Group the rows of `df` into one batch per URL. Within a batch, a "sample" is
# the running sequence of per-row features (presumably the chain of ancestors
# of an element in the page tree -- TODO confirm against the CSV producer).
# Results: `batches[k] = [css samples, id/name samples, level samples,
# tag samples]` and `labels[k]` = one label per sample of batch k.
columns = ['attrs', 'label', 'level', 'tag', 'text', 'url']

last_url = ''
batches = []

# `*_tss`: samples finished so far for the current URL.
# `*_ts`: the sample currently being built (one entry per row on the path).
css_tss = []
id_tss = []
level_tss = []
tag_tss = []
css_ts = []
id_ts = []
level_ts = []
tag_ts = []

labels = []
labels_batch = []
label = 'None'

last_level = -1

for attr, label_, level, tag, text, url in df[columns].values:
    # `attrs` is stored as the text of a Python literal; it is used below
    # like a mapping with optional 'class', 'id' and 'name' keys.
    attr = ast.literal_eval(attr)
    if url != last_url:
        # New page: close the running sample, store the finished batch and
        # start over. On the very first row this stores an empty placeholder
        # batch, which is dropped after the loop.
        css_tss.append(css_ts.copy())
        id_tss.append(id_ts.copy())
        level_tss.append(level_ts.copy())
        tag_tss.append(tag_ts.copy())
        css_ts = []
        id_ts = []
        level_ts = []
        tag_ts = []
        labels_batch.append(label)
        aux_list = []
        aux_list.append(css_tss.copy())
        aux_list.append(id_tss.copy())
        aux_list.append(level_tss.copy())
        aux_list.append(tag_tss.copy())
        batches.append(aux_list)
        css_tss = []
        id_tss = []
        level_tss = []
        tag_tss = []
        labels.append(labels_batch.copy())
        labels_batch = []
        last_url = url
        label = 'None'
        # NOTE(review): `last_level` is not reset here, so when the first row
        # of the new page has level <= the previous page's last level, the
        # branch below stores an empty sample labelled 'None'. Possibly
        # unintended -- confirm.
    # NOTE(review): `label` is updated before the flush below, so a sample
    # closed by this row receives this row's label; the value is only reset
    # on a URL change. Confirm that this is the intended labelling.
    if label_ != 'None':
        label = label_
    if not level > last_level:
        # The level did not increase: store the running sample, then keep
        # only its first `level` entries as the prefix of the next sample.
        # The copies are shallow, so per-row entries are shared by samples.
        css_tss.append(css_ts.copy())
        id_tss.append(id_ts.copy())
        level_tss.append(level_ts.copy())
        tag_tss.append(tag_ts.copy())
        css_ts = css_ts[:level]
        id_ts = id_ts[:level]
        level_ts = level_ts[:level]
        tag_ts = tag_ts[:level]
        labels_batch.append(label)
    
    # Features
    # CSS classes of the row, tokenised; an empty list when there is none.
    if 'class' in attr:
        css_ts.append(treat_str_list(attr['class']))
        #css_ts += treat_str_list(attr['class'])
    else:
        css_ts.append([])
    # Tokens of the 'id' and 'name' attributes share one feature slot.
    aux_list = []
    if 'id' in attr:
        #aux_list.append(treat_str_list([attr['id']]))
        aux_list += treat_str_list([attr['id']])
    if 'name' in attr:
        #aux_list.append(treat_str_list([attr['name']]))
        aux_list += treat_str_list([attr['name']])
        
    id_ts.append(aux_list)
    tag_ts.append([tag])
    level_ts.append([level])
    last_level = level

# Flush the sample and the batch that were still open when the rows ended.
css_tss.append(css_ts.copy())
id_tss.append(id_ts.copy())
level_tss.append(level_ts.copy())
tag_tss.append(tag_ts.copy())
labels_batch.append(label)
aux_list = []
aux_list.append(css_tss.copy())
aux_list.append(id_tss.copy())
aux_list.append(level_tss.copy())
aux_list.append(tag_tss.copy())
batches.append(aux_list)
labels.append(labels_batch.copy())
# Drop the placeholder batch created by the first row's URL change.
batches = batches[1:]
labels = labels[1:]

### Mapping the features

In [138]:
# Gather every per-row token list of the CSS (slot 0), id/name (slot 1) and
# tag (slot 3) features across all batches, then build one vocabulary per
# feature, capped at the 100 most frequent tokens.
css_, id_, tag_ = [], [], []
for batch in batches:
    for sample in batch[0]:
        css_.extend(sample)
    for sample in batch[1]:
        id_.extend(sample)
    for sample in batch[3]:
        tag_.extend(sample)

_, css2Idx = myHotEncode(css_, max_vocab=100)
_, id2Idx = myHotEncode(id_, max_vocab=100)
_, tag2Idx = myHotEncode(tag_, max_vocab=100)

In [139]:
def _encode_timestep(tokens, vocab2idx):
    """Return the token-count vector of one timestep (a list of tokens).

    The timestep is wrapped in a list so that ``myHotEncode`` sees it as a
    single sample of tokens. Passing the token list directly would make it
    treat every token string as a sample and look up its individual
    characters in the vocabulary.
    """
    encoded, _ = myHotEncode([tokens], vocab2idx=vocab2idx)
    vector = encoded[0]
    if not isinstance(vector, np.ndarray):
        # No token of the timestep is in the vocabulary: use an explicit
        # zero vector so every timestep has the same shape.
        vector = np.zeros(len(vocab2idx), dtype='float32')
    return vector


# Replace, in place, every timestep's token list with its encoded vector.
# Slot 0 holds CSS classes, slot 1 ids/names and slot 3 tags; slot 2 (levels)
# is left untouched.
for batch in batches:
    for slot, slot_vocab in ((0, css2Idx), (1, id2Idx), (3, tag2Idx)):
        for sample in batch[slot]:
            for i2, timestep in enumerate(sample):
                sample[i2] = _encode_timestep(timestep, slot_vocab)

In [148]:
# Inspect the CSS feature (slot 0) of the first sample of the first batch.
batches[0][0][0]

[[], [], [], []]