In [42]:
from lxml import etree
test_file = 'data3/0a961053-DESKTOP-svg-clean.svg'
# Open file
with open(test_file, 'r') as file:
    data = file.read()

In [43]:
from collections import Counter

def data_to_pattern(data):
    # Repeat the pattern "A " as many times as it's needed to match the data length.
    pattern = "A " * (len(data) // 2)
    if len(data) % 2 == 1:
        pattern += "A"
    return pattern

class CollectorTarget(object):
    def __init__(self):
        self.tags = Counter()
        self.attributes = Counter()
        self.attribute_values = list()
        self.data_values = list()
    
    def start(self, tag, attrib):
        if '{' == tag[0]:
            tag = tag.split('}')[-1]
        self.tags.update([tag])
        self.attributes.update(attrib.keys())
        
        for key, value in attrib.items():
            if not self._is_image(key, value):     
                self.attribute_values.append(value)
    def end(self, tag):
        pass
    def data(self, data):
        pass
        # self.data_values.append(data_to_pattern(data))
    def comment(self, text):
        pass
    def close(self):
        pass
    def _is_image(self, attrib_key, attrib_value):
        if  '.' in attrib_value:
            extension = attrib_value.split('.')[-1]
            if '?' in extension:
                extension = extension.split('?')[0]
            return extension.lower() in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp']

parser = etree.XMLParser(target = CollectorTarget())

result = etree.XML('<br />', parser)

In [44]:
from lxml.etree import XMLSyntaxError
import glob

files = glob.glob('data3/*')
all_files = [file for file in files if file.endswith('svg-clean.svg')]

files = all_files[:100]
target = CollectorTarget()

for file in files:
    with open(file, 'r') as f:
        data = f.read()
        try:
            parser = etree.XMLParser(target = target)
            result = etree.XML(data, parser)
        except XMLSyntaxError as e:
            print(file)
            print(e)
        
# print(target.tags.most_common())
# print(target.attributes.most_common())
# print(target.attribute_values[:100])

In [45]:
from tokenizers.implementations import ByteLevelBPETokenizer

text_tokenizer = ByteLevelBPETokenizer()
text_tokenizer.train_from_iterator([target.attribute_values + target.data_values], vocab_size=5000, min_frequency=2)






In [46]:
text_tokenizer.encode("childStackingContextsWithStackLevelZeroAndPositionedDescendantsWithStackLevelZero")

Encoding(num_tokens=1, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [53]:
class TokenizerTarget(object):
    def __init__(self, common_tags, common_attributes, text_tokenizer):
        # Todo: use faster data types here.
        self.common_tags = common_tags
        self.common_attributes = common_attributes
        self.text_tokenizer = text_tokenizer
        self.tokenized_image_urls = list()
        self.current_tokens = list()
        self.last_tokens = list()
        self.special_tokens = ['close tag', 'data starts']
        self.data_tokens = [pow(2, i) for i in range(30, -1, -1)]
        
        self.offsets = {}
        self.offsets['special_tokens'] = 0
        self.offsets['data_tokens'] = self.offsets['special_tokens'] + len(self.special_tokens)
        self.offsets['common_tags'] = self.offsets['special_tokens'] + len(self.data_tokens)
        self.offsets['common_attributes'] = self.offsets['common_tags'] + len(self.common_tags)
        self.offsets['text_tokens'] = self.offsets['common_attributes'] + len(self.common_attributes)
        self.offsets['image_urls'] = self.offsets['text_tokens'] + text_tokenizer.get_vocab_size()
    
    def start(self, tag, attrib):
        if '{' == tag[0]:
            tag = tag.split('}')[-1]
        self._tokenize_tag(tag)
        
        for key, value in attrib.items():
            if self._is_image(key, value):
                self._tokenize_image_attribute(key, value)
            else:
                self._tokenize_attribute(key, value)
    def end(self, tag):
        self.current_tokens.append(self.offsets['special_tokens'] + self.special_tokens.index('close tag'))
        pass
    def data(self, data):
        self.current_tokens.append(self.offsets['special_tokens'] + self.special_tokens.index('data starts'))
        self._tokenize_text(data)
        # self._tokenize_data(data)
        pass
    def comment(self, text):
        pass
    def close(self):
        self.last_tokens = self.current_tokens
        self.current_tokens = list()

    def _is_image(self, attrib_key, attrib_value):
        if  '.' in attrib_value:
            extension = attrib_value.split('.')[-1]
            if '?' in extension:
                extension = extension.split('?')[0]
            return extension.lower() in ['jpg', 'jpeg', 'png', 'gif', 'svg', 'webp']

    def _tokenize_tag(self, tag):
        if tag not in self.common_tags:
            raise Exception('Tag not in common tags: ' + tag)
        self.current_tokens.append(self.offsets['common_tags'] + self.common_tags.index(tag))

    def _tokenize_text(self, text):
        encoded_text = self.text_tokenizer.encode(text).ids
        self.current_tokens.extend([self.offsets['text_tokens'] + token for token in encoded_text])

    def _tokenize_attribute(self, key, value):
        if key not in self.common_attributes:
            raise Exception('Attribute not in common attributes: ' + key)
        self.current_tokens.append(self.offsets['common_attributes'] + self.common_attributes.index(key))
        self._tokenize_text(value)

    def _tokenize_image_attribute(self, key, value):
        if key not in self.common_attributes:
            raise Exception('Image attribute not in common attributes: ' + key)

        self.current_tokens.append(self.offsets['common_attributes'] + self.common_attributes.index(key))
        
        if value not in self.tokenized_image_urls:
            self.tokenized_image_urls.append(value)
        self.current_tokens.append(self.offsets['image_urls'] + self.tokenized_image_urls.index(value))

    def _tokenize_data(self, data):
        data_length = len(data)
        for token, token_length in enumerate(self.data_tokens):
            if data_length <= token_length:
                self.current_tokens.append(token)
                data_length -= token_length
                if data_length == 0:
                    break
                    
                    
    # ----------------
    
    def decode(self, tokens):
        decoded = []
        open_tags = list()

        class ParsingMode:
            IN_TAG = 0
            ATTRIBUTE_STARTED = 1
            IN_ATTRIBUTE = 2
            DATA = 3
            
        current_parsing_mode = ParsingMode.DATA
        
        for token in tokens:
            if token >= self.offsets['image_urls']:
                decoded.append(self._decode_image_url_token(token))
            elif token >= self.offsets['text_tokens']:
                if current_parsing_mode == ParsingMode.ATTRIBUTE_STARTED:
                    decoded.append('="')
                    current_parsing_mode = ParsingMode.IN_ATTRIBUTE
                decoded.append(self._decode_text_token(token))
            elif token >= self.offsets['common_attributes']:
                if current_parsing_mode == ParsingMode.ATTRIBUTE_STARTED:
                    decoded.append('="" ')
                if current_parsing_mode == ParsingMode.IN_ATTRIBUTE:
                    decoded.append('" ')
                if current_parsing_mode == ParsingMode.IN_TAG:
                    decoded.append(' ')
                current_parsing_mode = ParsingMode.ATTRIBUTE_STARTED
                decoded.append(self._decode_attribute_token(token))
            elif token >= self.offsets['common_tags']:
                if current_parsing_mode == ParsingMode.ATTRIBUTE_STARTED:
                    decoded.append('="">')
                if current_parsing_mode == ParsingMode.IN_ATTRIBUTE:
                    decoded.append('">')
                if current_parsing_mode == ParsingMode.IN_TAG:
                    decoded.append('>')
                current_tag = self._decode_tag_token(token)
                open_tags.append(current_tag)
                decoded.append('<' + current_tag)
                current_parsing_mode = ParsingMode.IN_TAG
            elif token >= self.offsets['data_tokens']:
                decoded.append(self._decode_data_token(token))
            else:
                special_token = self._decode_special_token(token)
                if special_token == 'close tag':
                    if current_parsing_mode != ParsingMode.DATA:
                        if current_parsing_mode == ParsingMode.ATTRIBUTE_STARTED:
                            decoded.append('=""')
                        if current_parsing_mode == ParsingMode.IN_ATTRIBUTE:
                            decoded.append('"')
                        decoded.append('>')
                    current_tag = open_tags.pop()
                    decoded.append('</' + current_tag + '>')
                    current_parsing_mode = ParsingMode.DATA
                if special_token == 'data starts':
                    if current_parsing_mode == ParsingMode.ATTRIBUTE_STARTED:
                        decoded.append('=""')
                    if current_parsing_mode == ParsingMode.IN_ATTRIBUTE:
                        decoded.append('"')
                    decoded.append('>')
                    current_parsing_mode = ParsingMode.DATA
        return "".join(decoded)

    def _decode_special_token(self, token):
        return self.special_tokens[token - self.offsets['special_tokens']]

    def _decode_data_token(self, token):
        return 'data'

    def _decode_tag_token(self, token):
        return self.common_tags[token - self.offsets['common_tags']]

    def _decode_attribute_token(self, token):
        return self.common_attributes[token - self.offsets['common_attributes']]

    def _decode_text_token(self, token):
        return self.text_tokenizer.decode([token - self.offsets['text_tokens']])

    def _decode_image_url_token(self, token):
        return self.tokenized_image_urls[token - self.offsets['image_urls']]



In [54]:
test_file = 'data3/0a961053-DESKTOP-svg-clean.svg'

tokenizer_target = TokenizerTarget(
    [item for item, count in target.tags.most_common(500)], 
    [item for item, count in target.attributes.most_common(5000)],
    text_tokenizer
)

# Open file
with open(test_file, 'r') as file:
    data = file.read()

    try:
        parser = etree.XMLParser(target = tokenizer_target)
        result = etree.XML(data, parser)
        
        print(len(tokenizer_target.last_tokens))
        print(len(data))
    except XMLSyntaxError as e:
        print(file)
        print(e)

5926
33023


In [57]:
parser = etree.XMLParser(target = tokenizer_target)
etree.XML('<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="1440" height="1148" viewBox="0 0 1440 1148"><text color="rgb(255, 255, 255)" dominant-baseline="text-after-edge" font-family="Icons" font-size="14.4px" font-stretch="100%" font-style="normal" font-variant="normal" font-weight="400" direction="ltr" letter-spacing="normal" text-decoration="none solid rgb(255, 255, 255)" text-anchor="start" text-rendering="auto" unicode-bidi="normal" word-spacing="0px" writing-mode="horizontal-tb" user-select="none" fill="rgb(255, 255, 255)">Hello</text><text></text><tspan>AAA</tspan><tspan/></svg>', parser)

tokens = tokenizer_target.last_tokens
print(tokens)

[43, 65, 1764, 66, 478, 671, 109, 162, 435, 4791, 462, 1956, 34, 67, 424, 154, 460, 158, 509, 158, 509, 155, 68, 441, 159, 544, 159, 532, 69, 3720, 70, 518, 160, 166, 452, 71, 521, 151, 72, 431, 73, 431, 74, 633, 75, 549, 76, 431, 77, 517, 552, 551, 154, 460, 158, 509, 158, 509, 155, 78, 542, 79, 411, 80, 431, 81, 162, 452, 82, 547, 159, 533, 83, 517, 60, 424, 154, 460, 158, 509, 158, 509, 155, 1, 186, 413, 466, 0, 34, 0, 32, 1, 4487, 0, 32, 0, 0]


In [58]:
decoded = tokenizer_target.decode(tokens)
print(decoded)

<svg width="1440" height="1148" viewBox="0 0 1440 1148"><text color="rgb(255, 255, 255)" dominant-baseline="text-after-edge" font-family="Icons" font-size="14.4px" font-stretch="100%" font-style="normal" font-variant="normal" font-weight="400" direction="ltr" letter-spacing="normal" text-decoration="none solid rgb(255, 255, 255)" text-anchor="start" text-rendering="auto" unicode-bidi="normal" word-spacing="0px" writing-mode="horizontal-tb" user-select="none" fill="rgb(255, 255, 255)">Hello</text><text></text><tspan>AAA</tspan><tspan></tspan></svg>


In [65]:
import cssutils

style = cssutils.parseString('a { color: rgb(255, 255, 255); }')
print(style.cssRules[0].selectorText)
print(style.cssRules[0].selectorText)

a
