In [198]:
import re
import html
import yaml

In [199]:
def split_post_into_block_tokens(post_text, start_pattern, end_pattern):
    """Cut a post into a flat list of ``start`` / ``finish`` / ``bbcode`` tokens.

    The post is first split on ``start_pattern``; every text chunk between
    two opening tags is then split again on ``end_pattern``.

    The even/odd bookkeeping below relies on ``re.split`` interleaving the
    captured group between the surrounding text pieces, i.e. on each pattern
    containing exactly one capturing group.

    Args:
        post_text: Full text of the post.
        start_pattern: Regex matching an opening block tag.
        end_pattern: Regex matching a closing block tag.

    Returns:
        A list of ``{"token_type": ..., "value": ...}`` dicts in document
        order, where ``token_type`` is ``"bbcode"`` (plain text), ``"start"``
        (captured part of an opening tag) or ``"finish"`` (captured part of
        a closing tag).
    """
    tokens = []

    for outer_pos, chunk in enumerate(re.split(start_pattern, post_text)):
        # Odd positions hold what the start pattern captured.
        if outer_pos % 2 == 1:
            tokens.append({"token_type": "start", "value": chunk})
            continue

        # Even positions are text that may still contain closing tags.
        for inner_pos, piece in enumerate(re.split(end_pattern, chunk)):
            kind = "bbcode" if inner_pos % 2 == 0 else "finish"
            tokens.append({"token_type": kind, "value": piece})

    return tokens

In [200]:
def text_is_empty(txt):
    """Tell whether ``txt`` carries no visible content.

    Args:
        txt: A string, or ``None``.

    Returns:
        ``True`` when ``txt`` is ``None`` or consists solely of spaces,
        newlines and tabs (the empty string included); ``False`` otherwise.
    """
    # Identity comparison is the idiomatic way to test for None.
    if txt is None:
        return True
    # fullmatch anchors both ends, so no explicit ^...$ is needed.
    return re.fullmatch(r'[ \n\t]*', txt) is not None

In [201]:
def clean_up_text(txt):
    """Normalise whitespace in a chunk of post text.

    Tabs become spaces, runs of spaces collapse to one, spaces adjacent to a
    newline are dropped, runs of newlines collapse to at most one blank line,
    and all leading and trailing newlines are removed.

    Args:
        txt: The text to clean.

    Returns:
        The cleaned text.
    """
    txt = re.sub('\t', ' ', txt)  # replace tabs with spaces
    txt = re.sub('  +', ' ', txt)  # deduplicate spaces
    txt = re.sub(' +\n', '\n', txt)  # no spaces before a new line - it is a "space" in and of itself
    txt = re.sub('\n +', '\n', txt)  # ditto, for spaces after a new line
    txt = re.sub('\n\n+', '\n\n', txt)  # a run of new lines is at most 2 new lines
    # Do not start or end with an empty line. The previous step can leave up
    # to two newlines at either edge, so strip them all rather than just one.
    return txt.strip('\n')

In [202]:
def clean_token_list(token_list):
    """Drop ``bbcode`` tokens whose value is empty or whitespace-only.

    Tokens of any other type are always kept.

    Args:
        token_list: Iterable of token dicts with ``token_type`` and ``value``.

    Returns:
        A new list with the remaining tokens in their original order.
    """
    kept = []
    for token in token_list:
        # Only plain-text tokens are candidates for removal.
        if token['token_type'] == 'bbcode' and text_is_empty(token['value']):
            continue
        kept.append(token)
    return kept

In [203]:
def build_block_tree(tokens):
    """Nest a flat token list into a tree of blocks.

    ``bbcode`` tokens become leaves of the current branch, ``start`` tokens
    open a new child branch and descend into it, and ``finish`` tokens climb
    back to the parent when they match the branch that is currently open.
    A ``finish`` token that does not match the open branch is ignored.

    Args:
        tokens: Sequence of ``{"token_type": ..., "value": ...}`` dicts.

    Returns:
        The root node: ``{'branch_type': 'root', 'elements': [...]}``. Every
        node below it is a dict with ``branch_type``, ``value`` and
        ``elements`` keys.
    """
    block_tree = {
        'branch_type' : 'root',
        'elements' : []
    }
    curr_branch = block_tree
    breadcrumbs = []  # stack of the ancestors of curr_branch

    for token in tokens:
        token_type = token['token_type']
        token_value = token['value']

        if token_type == 'bbcode':
            # stay on the same tree level
            curr_branch['elements'].append({
                "branch_type" : token_type,
                "value" : token_value,
                "elements" : []
            })

        elif token_type == 'start':
            # embed a lower level
            # Split on the first '=' only, so that a tag value which itself
            # contains '=' is kept whole instead of being truncated.
            item_type, _, item_val = token_value.partition('=')

            leaf = {
                "branch_type" : item_type,
                "value" : item_val,
                "elements" : []
            }

            curr_branch['elements'].append(leaf)
            breadcrumbs.append(curr_branch)
            curr_branch = leaf

        elif token_type == 'finish':
            # roll-up to a higher layer, but only when the closing tag
            # matches the branch that is open and there is a parent to
            # return to (a stray closing tag at root level is ignored).
            if breadcrumbs and curr_branch['branch_type'] == token_value:
                curr_branch = breadcrumbs.pop()

    return block_tree

In [204]:
# Load the raw post and turn HTML entities (e.g. "&#58;") back into characters.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used - confirm that it matches the encoding of post.txt.
with open("post.txt", 'r') as f:
    post_text = html.unescape(f.read())

# uid suffix carried by the BBCode tags of this post (e.g. "[url:17hlnlju]").
bb_uid = '17hlnlju'

In [205]:
# Raw strings keep the regex backslashes literal; the resulting pattern text
# is the same as before, without relying on invalid string escape sequences.
# Tags that open/close a nested block.
block_tags = r'quote|code|list|\*'
# Opening tag: captures the tag name plus anything up to ":<uid>]".
start_pattern = rf'\[((?:{block_tags}).*?)\:{bb_uid}\]'
# Closing tag: captures the tag name; an optional ":m", ":u" or ":o" marker
# before the uid is matched but not captured.
end_pattern = rf'\[\/({block_tags})(?:\:m|\:u|\:o)?\:{bb_uid}\]'

In [206]:
# Tokenise the post on block-tag boundaries, then drop the text tokens that
# contain nothing but whitespace.
block_tokens = split_post_into_block_tokens(post_text, start_pattern, end_pattern)
clean_block_tokens =  clean_token_list(block_tokens)

In [281]:
# Nest the flat token list into a tree whose top node has branch_type 'root'.
block_tree = build_block_tree(clean_block_tokens)

In [208]:
# Sample snippet containing the item tags handled further down: img, youtube,
# and url both without and with an "=" value.
# NOTE(review): `text` is not referenced by the other cells visible here.
text = '''
aaa
[img:17hlnlju]https&#58;//v&#46;wpimg&#46;pl/MDc0OC5wYiUCUjlwGgxvMEEKbSpcVWFmFhJ1YRpFfHwbAXxzGgQqMxMeODNWEyNqE1xgIgdHdCZWBXl1DUYvJloELnZTRSl9U1N3IQVDfHRUBHdtRRkqZh4[/img:17hlnlju]
bbb
[youtube:17hlnlju]iBxVRwynmmE[/youtube:17hlnlju]
ccc
[url:17hlnlju]http&#58;//www&#46;wp&#46;pl[/url:17hlnlju]
ddd
[url=http&#58;//www&#46;wp&#46;pl:17hlnlju]wu pe pe el[/url:17hlnlju]
eee
'''

In [285]:
# Raw f-strings keep the regex backslashes literal instead of relying on
# invalid string escape sequences. "\n" is now a regex escape rather than a
# literal newline character; both match exactly one newline.
# Tags that wrap a single item rather than a nested block.
item_tags = 'url|img|youtube'
# Used for splitting: group 1 is the whole "[tag...]...[/tag]\n" instance,
# group 2 the tag name (back-referenced so the closing tag must match).
# NOTE(review): both patterns require a newline right after the closing tag,
# while clean_up_text strips the trailing newline of a block - an item at the
# very end of a block will therefore not match. Confirm this is intended.
item_instance_pattern = rf'(\[({item_tags})=?[^\]]*\:{bb_uid}\].*?\[/\2\:{bb_uid}\]\n)'
# Used for parsing one instance: tag name, optional "=" value, inner text,
# trailing newline.
item_parts_pattern = rf'\[({item_tags})=?([^\]]*)\:{bb_uid}\](.*?)\[/\1\:{bb_uid}\](\n)'

In [286]:
def parse_bb_block_item(text, pattern):
    """Parse a single BBCode item into a tree node.

    Args:
        text: Text expected to contain one item.
        pattern: Regex with capturing groups; the first group is taken as the
            node type and the remaining groups as its value.

    Returns:
        ``{'branch_type': <group 1>, 'value': [<group 2>, ...], 'elements': []}``
        built from the first match of ``pattern`` in ``text``. Groups that did
        not take part in the match are reported as ``''``. When nothing
        matches (or the pattern has no groups) ``branch_type`` and ``value``
        are ``None``.
    """
    # Carry an (empty) 'elements' list like every other node of the tree, so
    # that code walking the tree can treat all nodes uniformly.
    result = {
            'branch_type' : None,
            'value' : None,
            'elements' : []
        }

    # Only the first match is of interest. Match.groups() always yields one
    # entry per capturing group, even when the pattern has a single group.
    match = re.search(pattern, text)
    if match is not None:
        el = list(match.groups(default=''))
        if len(el) > 0:
            result['branch_type'] = el[0]
            result['value'] = el[1:]

    return result

In [287]:
def tokenize_phpbb_block(text, split_pattern, parse_pattern):
    """Split a text block into plain-text nodes and parsed item nodes.

    The text is cleaned with ``clean_up_text``, split on ``split_pattern`` and
    every piece is HTML-unescaped. Pieces are then consumed in groups of
    three: surrounding text, the matched item, and a third piece that is
    discarded. This layout corresponds to a ``split_pattern`` with exactly
    two capturing groups, the first of which spans the whole item.

    Args:
        text: Raw text of the block.
        split_pattern: Regex used to cut items out of the text.
        parse_pattern: Regex handed to ``parse_bb_block_item`` for each item.

    Returns:
        A list of nodes in document order: ``bb_code_text`` dicts for plain
        text and whatever ``parse_bb_block_item`` returns for items.
    """
    cleaned = clean_up_text(text)
    pieces = [html.unescape(piece) for piece in re.split(split_pattern, cleaned)]

    leaves = []
    for position, piece in enumerate(pieces):
        slot = position % 3
        if slot == 0:
            # plain text between items
            leaves.append({
                'branch_type' : 'bb_code_text',
                'value': piece,
                'elements' : []
            })
        elif slot == 1:
            # the complete item as captured by the first group
            leaves.append(parse_bb_block_item(piece, parse_pattern))
        # slot == 2 is the second captured group; it is not needed here

    return leaves

In [288]:
def retokenize_tree_branch_for_items(branch, split_pattern, parse_pattern):
    """Recursively replace ``bbcode`` leaves by text and item nodes.

    Every child with ``branch_type == 'bbcode'`` is replaced by the nodes that
    ``tokenize_phpbb_block`` produces from its value; every other child is
    processed recursively.

    The tree is modified in place: ``branch['elements']`` is reassigned on
    ``branch`` and on every descendant that is visited.

    Args:
        branch: Tree node (dict) to process.
        split_pattern: Passed through to ``tokenize_phpbb_block``.
        parse_pattern: Passed through to ``tokenize_phpbb_block``.

    Returns:
        The same ``branch`` object, after modification.
    """
    new_elements = []

    # Tolerate nodes without an 'elements' key by treating them as childless
    # instead of raising KeyError.
    for leaf in branch.get('elements', []):
        if leaf['branch_type'] == 'bbcode':
            new_elements.extend(
                tokenize_phpbb_block(leaf['value'], split_pattern, parse_pattern)
            )
        else:
            new_elements.append(
                retokenize_tree_branch_for_items(leaf, split_pattern, parse_pattern)
            )

    branch['elements'] = new_elements

    return branch

In [289]:
# Split the text leaves of the tree into text/item nodes and print the result
# as YAML. Note that retokenize_tree_branch_for_items rewrites block_tree in
# place, so block_tree itself is changed by running this cell.
print(yaml.dump(retokenize_tree_branch_for_items(block_tree, item_instance_pattern, item_parts_pattern)))

branch_type: root
elements:
- branch_type: bb_code_text
  elements: []
  value: '<!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight"
    /><!-- s:fight: --> <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:"
    title="Hurra!" /><!-- s:hurra: --> <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif"
    alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: --> <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif"
    alt=":wnerw:" title="wnerw" /><!-- s:wnerw: -->


    <!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight"
    /><!-- s:fight: --> <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:"
    title="Hurra!" /><!-- s:hurra: --> <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif"
    alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: --> <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif"
    alt=":wnerw:" title="wnerw" /><!-- s:wnerw: -->'
- branch_type: quote
  elements:
  - branch_type: bb_c