### Util

#### Hot encode and decode

In [1]:
import numpy as np
from collections import OrderedDict
from operator import itemgetter


def myHotEncode(input_data, max_vocab=0, vocab2idx=None):
    """Encode token samples as count vectors over a vocabulary.

    Every sample of ``input_data`` is an iterable of hashable tokens. It is
    encoded as a float32 vector of length ``len(vocab2idx)`` whose entry at
    ``vocab2idx[token]`` is the number of times the token occurs in the
    sample. Tokens that are not in the vocabulary are ignored, and a sample
    without any known token becomes an all-zero vector.

    Args:
        input_data: Sequence of samples. It is traversed twice when the
            vocabulary is built here, so it must be re-iterable.
        max_vocab: When positive, keep only the ``max_vocab`` most frequent
            tokens in a vocabulary built here. Unused when ``vocab2idx`` is
            supplied.
        vocab2idx: Optional token -> index mapping to reuse. When omitted,
            one is built from ``input_data`` with the most frequent token at
            index 0.

    Returns:
        A tuple ``(data_ret, vocab2idx)``. ``data_ret`` is a list holding one
        vector per sample; for an empty ``input_data`` it is a single zero
        array of length ``len(vocab2idx)`` instead of an empty list.
    """
    if vocab2idx is None:
        vocab_freq = {}
        for sample in input_data:
            for token in sample:
                vocab_freq[token] = vocab_freq.get(token, 0) + 1
        # sorted() is stable, so equally frequent tokens keep first-seen order.
        ranked = sorted(vocab_freq.items(), key=itemgetter(1), reverse=True)
        if max_vocab > 0:
            ranked = ranked[:max_vocab]
        vocab2idx = {token: idx for idx, (token, _) in enumerate(ranked)}

    vocab_size = len(vocab2idx)
    data_ret = []
    for sample in input_data:
        # Start from zeros so a sample without known tokens is still a vector
        # (summing an empty list of rows would give the plain int 0).
        vector = np.zeros(vocab_size, dtype='float32')
        for token in sample:
            idx = vocab2idx.get(token)
            if idx is not None:
                vector[idx] += 1
        data_ret.append(vector)
    if len(data_ret) == 0:
        data_ret = np.zeros(vocab_size)
    return data_ret, vocab2idx


def myHotDecode(input_data, vocab2idx):
    """Turn encoded vectors back into tokens and vocabulary indexes.

    Args:
        input_data: Iterable of vectors; each vector must have exactly
            ``len(vocab2idx)`` entries.
        vocab2idx: Token -> index mapping that was used for the encoding.

    Returns:
        A tuple ``(data_, data_idx)``. For every input vector ``data_`` holds
        the tokens whose entry is greater than zero and ``data_idx`` holds the
        matching positions. If a vector has the wrong length, a message is
        printed and ``None`` is returned instead of the tuple.

    Raises:
        KeyError: If an active position is not an index of ``vocab2idx``.
    """
    # Invert the mapping once instead of scanning the vocabulary for every
    # active position. setdefault keeps the first token of a repeated index,
    # which is what a front-to-back scan of the vocabulary would pick.
    idx2vocab = {}
    for token, idx in vocab2idx.items():
        idx2vocab.setdefault(idx, token)

    vocab_size = len(vocab2idx)
    data_ = []
    data_idx = []
    for row in input_data:
        if len(row) != vocab_size:
            print('Erro:', 'The vocab2idx not fit the input data!')
            return None
        active = [pos for pos, value in enumerate(row) if value > 0]
        data_.append([idx2vocab[pos] for pos in active])
        data_idx.append(active)
    return data_, data_idx

#### Tokenize a list of strings

In [2]:
import re


def treat_str_list(input_list):
    """Split every string of ``input_list`` into lowercase tokens.

    Strings are split on runs of non-word characters and underscores, and the
    tokens of all strings are returned as one flat list in input order. Empty
    fragments, which ``re.split`` produces for an empty string or one that
    starts or ends with a separator, are dropped.

    Args:
        input_list: Iterable of strings.

    Returns:
        List of lowercase tokens.
    """
    ret = []
    for s in input_list:
        # Whitespace is a separator too, so tokens never need stripping.
        ret.extend(tok.lower() for tok in re.split(r'[\W_]+', s) if tok)
    return ret

### Load data

In [11]:
import pandas as pd


# Relative path of the annotated feature table.
data_path = 'features_annotated.csv'
# Load the table; later cells read its 'attrs', 'label', 'level', 'tag',
# 'text' and 'url' columns.
df = pd.read_csv(data_path)

In [43]:
import ast


columns = ['attrs', 'label', 'level', 'tag', 'text', 'url']

# Finished samples. Every sample is a list of timesteps and every timestep is
# the feature list of one row. Presumably rows are page nodes in document
# order and `level` is the node depth, so a sample is the chain of ancestors
# down to one node -- TODO confirm against the CSV producer.
css_tss = []
id_tss = []
level_tss = []
tag_tss = []
# NOTE(review): url_tss and url_ts are declared but never filled below, so
# url_tss stays empty.
url_tss = []

# Sample currently being built.
css_ts = []
id_ts = []
level_ts = []
tag_ts = []
url_ts = []

# One label per finished sample: the last non-'None' row label seen since the
# previous sample was closed, or 'None' when there was none.
labels = []
label = 'None'

last_url = ''
last_level = -1

for attr, label_, level, tag, text, url in df[columns].values:
    # 'attrs' is stored as a Python literal; literal_eval parses it without
    # executing code.
    attr = ast.literal_eval(attr)
    if url != last_url:
        # A new page starts: close the sample under construction and reset.
        css_tss.append(css_ts.copy())
        id_tss.append(id_ts.copy())
        level_tss.append(level_ts.copy())
        tag_tss.append(tag_ts.copy())
        labels.append(label)
        css_ts = []
        id_ts = []
        level_ts = []
        tag_ts = []
        label = 'None'
        last_url = url
        # Forget the depth reached on the previous page. Without this the
        # check below would also fire on this row and emit an extra, empty
        # sample labelled 'None' (assumes levels are never negative).
        last_level = -1
    if not level > last_level:
        # The level did not increase, so the previous row ends a sample.
        # Close it and keep only the first `level` timesteps as the prefix
        # shared with the current row.
        css_tss.append(css_ts.copy())
        id_tss.append(id_ts.copy())
        level_tss.append(level_ts.copy())
        tag_tss.append(tag_ts.copy())
        labels.append(label)
        css_ts = css_ts[:level]
        id_ts = id_ts[:level]
        level_ts = level_ts[:level]
        tag_ts = tag_ts[:level]
        label = 'None'
    if label_ != 'None':
        label = label_

    # Features of the current row, appended as one new timestep.
    if 'class' in attr:
        # Assumes attr['class'] is a list of strings -- TODO confirm.
        css_ts.append(treat_str_list(attr['class']))
    else:
        css_ts.append([])
    # 'id' and 'name' tokens share one feature list.
    aux_list = []
    if 'id' in attr:
        aux_list += treat_str_list([attr['id']])
    if 'name' in attr:
        aux_list += treat_str_list([attr['name']])
    id_ts.append(aux_list)
    tag_ts.append([tag])
    level_ts.append([level])
    last_level = level

# Close the sample that was still open when the rows ran out.
css_tss.append(css_ts.copy())
id_tss.append(id_ts.copy())
level_tss.append(level_ts.copy())
tag_tss.append(tag_ts.copy())
labels.append(label)

# The first entry of every list is the empty placeholder emitted when the
# first row switched away from the initial last_url of ''; drop it.
css_tss = css_tss[1:]
id_tss = id_tss[1:]
level_tss = level_tss[1:]
tag_tss = tag_tss[1:]
url_tss = url_tss[1:]
labels = labels[1:]

### Mapping the features

In [44]:
# Flatten every feature from "list of samples" to one flat list of timesteps,
# which is the input layout myHotEncode expects.
css_ = [step for path in css_tss for step in path]
id_ = [step for path in id_tss for step in path]
tag_ = [step for path in tag_tss for step in path]

# Only the vocabularies are kept (capped at the 100 most frequent tokens);
# the encoded data returned alongside is discarded.
_, css2Idx = myHotEncode(css_, 100)
_, id2Idx = myHotEncode(id_, 100)
_, tag2Idx = myHotEncode(tag_, 100)

### Mapping the output

In [45]:
# Label vocabulary without a size cap: each label is wrapped as a one-token
# sample, and only the resulting mapping is kept.
_, label2Idx = myHotEncode([[label_name] for label_name in labels])

In [47]:
def _encode_timesteps(samples, vocab2idx):
    """Replace, in place, every timestep of every sample by its count vector.

    Args:
        samples: List of samples; each sample is a list of timesteps and each
            timestep is a list of tokens.
        vocab2idx: Token -> index mapping used for the encoding.
    """
    vocab_size = len(vocab2idx)
    for sample in samples:
        for pos, timestep in enumerate(sample):
            # myHotEncode takes a list of samples. The token list is wrapped
            # so the whole timestep becomes ONE vector; passing it bare would
            # make myHotEncode iterate the characters of each token.
            encoded, _ = myHotEncode([timestep], vocab2idx=vocab2idx)
            vector = encoded[0]
            if np.ndim(vector) == 0:
                # myHotEncode can hand back the scalar 0 when no token of the
                # timestep is in the vocabulary; use an explicit zero vector
                # so all timesteps have the same shape.
                vector = np.zeros(vocab_size, dtype='float32')
            sample[pos] = vector


_encode_timesteps(css_tss, css2Idx)  # CSS classes
_encode_timesteps(id_tss, id2Idx)    # IDs and names
_encode_timesteps(tag_tss, tag2Idx)  # Tags

## Model

In [None]:
from keras.models import Model, Sequential
from keras.layers import TimeDistributed,Conv1D,Dense,Embedding,Input,Dropout,LSTM,Bidirectional,MaxPooling1D,Flatten,concatenate,Concatenate


def lstm_model():
    """Build and compile the LSTM classifier over the four feature inputs.

    The model takes variable-length sequences of CSS, id/name, level and tag
    features. Each input goes through its own Dense layer, the results are
    concatenated per timestep, fed to an LSTM and classified with a softmax
    over ``len(label2Idx)`` classes. Layer widths come from the module-level
    vocabularies ``css2Idx``, ``id2Idx``, ``tag2Idx`` and ``label2Idx``.

    Returns:
        The compiled Keras ``Model`` (its summary is printed as a side
        effect).
    """
    input_css = Input(shape=(None, len(css2Idx)), name='input_css')
    css = Dense(len(css2Idx))(input_css)

    input_id = Input(shape=(None, len(id2Idx)), name='input_id')
    _id = Dense(len(id2Idx))(input_id)

    input_level = Input(shape=(None, 1), name='input_level')
    level = Dense(1)(input_level)

    input_tag = Input(shape=(None, len(tag2Idx)), name='input_tag')
    tag = Dense(len(tag2Idx))(input_tag)

    # The level branch is merged as well; it used to be computed and then
    # left out, which made input_level a dangling model input.
    merged = concatenate([css, _id, level, tag])
    hidden = LSTM(256)(merged)
    # The Dense layer has to be applied to the LSTM output: Model expects
    # tensors as outputs, not a bare layer object.
    output = Dense(len(label2Idx), activation='softmax')(hidden)

    model = Model(inputs=[input_css, input_id, input_level, input_tag], outputs=[output])
    model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
    model.summary()
    return model

## Test

In [None]:
from sklearn.model_selection import GroupKFold
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix


# Per-fold scores, aggregated after the loop.
accuracy = []
precision = []
recall = []
f1 = []
confusion = []

# NOTE(review): `features` and `model` are not defined in the visible part of
# this file; they have to exist before this cell runs.
gp = GroupKFold(n_splits=5)
for train_indexs, test_indexs in gp.split(features['tags'], groups=features['groups']):
    # reshape inserts a length-1 axis: (samples, width) -> (samples, 1, width).
    tags_train = features['tags'][train_indexs]
    tags_train = tags_train.reshape((tags_train.shape[0], 1, tags_train.shape[1]))
    levels_train = features['levels'][train_indexs]
    levels_train = levels_train.reshape((levels_train.shape[0], 1, levels_train.shape[1]))
    X_train = [tags_train, levels_train]
    y_train = features['labels'][train_indexs]

    tags_test = features['tags'][test_indexs]
    tags_test = tags_test.reshape((tags_test.shape[0], 1, tags_test.shape[1]))
    levels_test = features['levels'][test_indexs]
    levels_test = levels_test.reshape((levels_test.shape[0], 1, levels_test.shape[1]))
    X_test = [tags_test, levels_test]
    y_test = features['labels'][test_indexs]

    # Show which groups ended up on each side of the split.
    groups_train = features['groups'][train_indexs]
    groups_test = features['groups'][test_indexs]
    train_g = set(groups_train)
    test_g = set(groups_test)
    print('Train Groups (URLs)', train_g)
    print('Test Groups (URLs)', test_g)

    # NOTE(review): the same `model` object is fit in every fold. Unless its
    # fit() rebuilds the network, weights learned on earlier folds carry over
    # into later ones -- confirm and re-create the model per fold if needed.
    model.fit(X_train, y_train, epochs=5, verbose=0)

    result = np.asarray(model.predict(X_test))
    if result.ndim > 1 and result.shape[-1] > 1:
        # Per-class scores: reduce them to the predicted class index so they
        # are comparable with the class indexes in y_test.
        result = result.argmax(axis=-1)
    # Class index of every one-hot label row.
    y_test = [r.tolist().index(1) for r in y_test]

    # sklearn metrics take (y_true, y_pred). With the arguments swapped,
    # precision and recall trade places and the confusion matrix is
    # transposed.
    acc = accuracy_score(y_test, result)
    accuracy.append(acc)
    p = precision_score(y_test, result, average="macro")
    precision.append(p)
    r = recall_score(y_test, result, average="macro")
    recall.append(r)
    f = f1_score(y_test, result, average="macro")
    f1.append(f)
    confusion.append(confusion_matrix(y_test, result))

    print("%s: %.2f %%" % ('Acc', acc*100))
    print("%s: %.2f %%" % ('Precision', p*100))
    print("%s: %.2f %%" % ('Recall', r*100))
    print("%s: %.2f %%" % ('F1', f*100))
    print('')

# Mean and standard deviation over the folds.
print("Acc %.2f %% (+/- %.2f %%)" % (np.mean(accuracy)*100, np.std(accuracy)*100))
print("Precision %.2f %% (+/- %.2f %%)" % (np.mean(precision)*100, np.std(precision)*100))
print("Recall %.2f %% (+/- %.2f %%)" % (np.mean(recall)*100, np.std(recall)*100))
print("F1 %.2f %% (+/- %.2f %%)" % (np.mean(f1)*100, np.std(f1)*100))