In [3]:
import re
import html
import yaml

In [5]:
def split_post_into_block_tokens(post_text, start_pattern, end_pattern):
    
    token_groups = [
            [
                {"token_type" : "bbcode", "value" : e }
                if i%2 ==0
                else {"token_type" : "finish", "value" : e}
                for i, e in enumerate(
                    re.split(end_pattern, item)
                )
            ]
            if index%2 == 0
            else [{"token_type" : "start", "value" : item}]
            for index, item in enumerate(
                re.split(start_pattern, post_text)
            )
        ]
    
    tokens = []
    for g in token_groups:
        tokens.extend(g)
    
    return tokens

In [6]:
def text_is_empty(txt):
    if txt == None:
        return True
    return re.search('^[ \n\t]*$', txt) != None

In [7]:
def clean_up_text(txt):
    txt = re.sub(r'\t', ' ', txt) # remove tabs
    txt = re.sub(r'  +', ' ', txt) # deduplicate spaces
    txt = re.sub(r' +\n', '\n', txt) # no spaces before a new line - it is a "space" in an of its own).
    txt = re.sub(r'\n +', '\n', txt) # ditto, for spaces after a new line
    txt = re.sub(r'\n\n+', '\n\n', txt) # more than oen new line is at max. 2 new lines
    txt = re.sub(r'([^\n])\n([^\n])', r'\1\n  \2', txt)
    return txt

In [8]:
def clean_token_list(token_list):
    result = [
        item
        for item in token_list
        if (
            item['token_type'] != 'bbcode'
            or not text_is_empty(item['value'])
        )
    ]
    return result

In [9]:
def build_block_tree(tokens):

    block_tree = {
        'branch_type' : 'root',
        'elements' : []
    }
    curr_branch = block_tree
    breadcrumbs = []

    for pos in range(0,len(tokens)):

        token = tokens[pos]
        token_type = token['token_type']
        token_value = token['value']

        # stay on the same tree level
        if token_type == 'bbcode':

            leaf = {
                "branch_type" : token_type,
                "value" : token_value,
                "elements" : []
            }

            curr_branch['elements'].append(leaf)

        # embed a lower level
        if token_type == 'start':

            item_parts =  token_value.split('=')
            item_type = item_parts[0]
            item_val = ''
            if len(item_parts) > 1:
                item_val = item_parts[1]

            leaf = {
                "branch_type" : item_type,
                "value" : item_val,
                "elements" : []
            }

            curr_branch['elements'].append(leaf)
            breadcrumbs.append(curr_branch)
            curr_branch = leaf
            

        # roll-up to a higher layer
        if token['token_type'] == 'finish':

            # check if last opening and current closing tags match
            containing_branch_tag = curr_branch['branch_type']
            if containing_branch_tag == token_value:
                curr_branch = breadcrumbs.pop()

    return block_tree

In [25]:
with open("post.txt", 'r') as f:
    post_text = f.read()
    post_text = html.unescape(post_text)

bb_uid = '17hlnlju'

In [11]:
block_tags = 'quote|code|list|\*'
start_pattern = f'\[((?:{block_tags}).*?)\:{bb_uid}\]'
end_pattern = f'\[\/({block_tags})(?:\:m|\:u|\:o)?\:{bb_uid}\]'

In [12]:
block_tokens = split_post_into_block_tokens(post_text, start_pattern, end_pattern)
clean_block_tokens =  clean_token_list(block_tokens)

In [13]:
block_tree = build_block_tree(clean_block_tokens)

In [80]:
text = '''
aaa[img:17hlnlju]https&#58;//v&#46;wpimg&#46;pl/MDc0OC5wYiUCUjlwGgxvMEEKbSpcVWFmFhJ1YRpFfHwbAXxzGgQqMxMeODNWEyNqE1xgIgdHdCZWBXl1DUYvJloELnZTRSl9U1N3IQVDfHRUBHdtRRkqZh4[/img:17hlnlju]bbb[youtube:17hlnlju]iBxVRwynmmE[/youtube:17hlnlju]ccc[url:17hlnlju]http&#58;//www&#46;wp&#46;pl[/url:17hlnlju]ddd[url=http&#58;//www&#46;wp&#46;pl:17hlnlju]wu pe pe el[/url:17hlnlju]eee
'''

In [23]:
re.findall(r'(aa|bb|cc)(.*?)\1', 'asdvdbtaaasfvetbybberfgaadevbtcc')

[('aa', 'asfvetbybberfg')]

In [94]:
item_tags = 'url|img|youtube'
item_pattern_str = f'(?:\[(?:({item_tags}))=?([^\]]*)\:{bb_uid}\])(.*?)\[/[^\]]*\]'
item_pattern_str = f'(?<=\[)({item_tags})(?==?)([^\]]*)(?=\:{bb_uid}\])(.*?)(?=\[/[^\]]*\])'
item_pattern = re.compile(item_pattern_str)
re.findall(item_pattern, text)

[('img',
  '',
  ':17hlnlju]https&#58;//v&#46;wpimg&#46;pl/MDc0OC5wYiUCUjlwGgxvMEEKbSpcVWFmFhJ1YRpFfHwbAXxzGgQqMxMeODNWEyNqE1xgIgdHdCZWBXl1DUYvJloELnZTRSl9U1N3IQVDfHRUBHdtRRkqZh4'),
 ('youtube', '', ':17hlnlju]iBxVRwynmmE'),
 ('url', '', ':17hlnlju]http&#58;//www&#46;wp&#46;pl'),
 ('url', '=http&#58;//www&#46;wp&#46;pl', ':17hlnlju]wu pe pe el')]

In [124]:
root['elements'][1]['elements'][12]

{'type': 'list',
 'value': '1',
 'elements': [{'type': '*',
   'value': '',
   'elements': [{'type': 'text', 'value': 'eee'}]},
  {'type': '*', 'value': '', 'elements': [{'type': 'text', 'value': 'fff'}]},
  {'type': 'list',
   'value': 'a',
   'elements': [{'type': '*',
     'value': '',
     'elements': [{'type': 'text', 'value': '111'}]},
    {'type': '*', 'value': '', 'elements': [{'type': 'text', 'value': '222'}]},
    {'type': '*',
     'value': '',
     'elements': [{'type': 'text', 'value': '333'}]}]},
  {'type': '*', 'value': '', 'elements': [{'type': 'text', 'value': 'x'}]},
  {'type': 'list',
   'value': 'a',
   'elements': [{'type': '*',
     'value': '',
     'elements': [{'type': 'text', 'value': '111'}]},
    {'type': 'list',
     'value': 'a',
     'elements': [{'type': '*',
       'value': '',
       'elements': [{'type': 'text', 'value': '444'}]},
      {'type': '*',
       'value': '',
       'elements': [{'type': 'text', 'value': '555'}]},
      {'type': '*',
      

In [125]:
md = ''
for i in root['elements'][1]['elements']: 
    md += render(i)

list : 
list : list
list : 
list : 
list : list
list : list
list : list


In [126]:
print(md)

bold

 **lorem** 

italic

 _lorem_ 

underscore

 <u>lorem</u> 

cytaty
[quote][quote]

kod

[code:17hlnlju]lorem[/code:17hlnlju]

listy


* aaa
* bbb
  * aaa
  * bbb
* aaa
* bbb
* aaa
* bbb


1. ccc
2. ddd


1. eee
2. fff
   1. 111
   2. 222
   3. 333
3. x
   1. 111
      1. 444
      2. 555
      3. 666
   2. 333
4. iii

[img]

[youtube:17hlnlju]iBxVRwynmmE[/youtube:17hlnlju]

<http://www.wp.pl>[wu pe pe el](http://www.wp.pl)

[s:17hlnlju]strike[/s:17hlnlju]

X[super:17hlnlju]1[/super:17hlnlju]

Y[sub:17hlnlju]2[/sub:17hlnlju]

[tex:17hlnlju]asad[/tex:17hlnlju]


In [127]:
start_pattern = f'\[quote(?:=\&quot\;([^\&]+)\&quot\;)?:{bb_uid}\]'
end_pattern = f'\[(\/)quote:{bb_uid}\]'
tag_pattern =  start_pattern + '|' + end_pattern

In [128]:
quote_parts = re.split(tag_pattern, post_text)

In [129]:
quote_parts.reverse()
tree = []
level = 0
tree.append({
    "level" : level,
    "author" : '',
    "text" : quote_parts.pop()
})
while len(quote_parts) > 0:
    
    tag_start = quote_parts.pop()
    tag_end = quote_parts.pop()
    tag_txt = quote_parts.pop()
    
    auth = ''

    if tag_end == '/':
        level -= 1    
    else:
        level += 1
        if tag_start is not None:
            auth = tag_start
    
    tree.append({
        "level" : level,
        "author" : auth,
        "text" : tag_txt
    })
    

tree

[{'level': 0,
  'author': '',
  'text': '<!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n <!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n[quote="trekker":17hlnlju]bold\n\n[b:17hlnlju]lorem[/b:17hlnlju]\n\nitalic\n\n[i:17hlnlju]lorem[/i:17hlnlju]\n\nunders

In [130]:
tree

[{'level': 0,
  'author': '',
  'text': '<!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n <!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n[quote="trekker":17hlnlju]bold\n\n[b:17hlnlju]lorem[/b:17hlnlju]\n\nitalic\n\n[i:17hlnlju]lorem[/i:17hlnlju]\n\nunders

In [131]:
lines = []
for item in tree:
    a = item['author']
    l = item['level']
    t = item['text']
    if a != '':
        lines.append((l - 1) * '> ' + f'({a} napisał/a:)')
    for line in t.split('\n'):
        lines.append(l * '>' + ' ' + line)

In [132]:
'aaa'.replace(old, new)

NameError: name 'old' is not defined

x<em>2</em>

In [None]:
lines = [l.replace(f'[b:{bb_uid}]', ' **') for l in lines]
lines = [l.replace(f'[/b:{bb_uid}]', '** ') for l in lines]
lines = [l.replace(f'[u:{bb_uid}]', ' <u>') for l in lines]
lines = [l.replace(f'[/u:{bb_uid}]', '</u> ') for l in lines]
lines = [l.replace(f'[i:{bb_uid}]', ' _') for l in lines]
lines = [l.replace(f'[/i:{bb_uid}]', '_ ') for l in lines]
for l in lines:
    print(l)

In [None]:
items_list = [
    {"l" : 0, "v" : 'a', "type" : 'text'},
    {"l" : 1, "v" : 'b', "type" : 'b'},
    {"l" : 1, "v" : 'c', "type" : 'quote'},
    {"l" : 2, "v" : 'd', "type" : 'quote'},
    {"l" : 1, "v" : 'e', "type" : 'text'},
    {"l" : 2, "v" : 'f'},
    {"l" : 1, "v" : 'g'},
    {"l" : 0, "v" : 'h'},
    {"l" : 1, "v" : 'i'},
    {"l" : 0, "v" : 'j'}
]
items_list.reverse()

In [None]:
root = {
    "type" : 'text',
    "content" : []
}
branch = root
levels = []
current_level = 0

while len(items_list) > 0:
    item = items_list.pop()
    
    if item['l'] == current_level:
        branch['content'].append(
            {
                "type" : 'text',
                "content" : [item['v']]
            }
        )

    if item['l'] > current_level:
        current_level += 1
        
        new_branch = {
            "type" : "text",
            "content" : []
        }
        new_branch['content'].append(item['v'])
        
        branch['content'].append(new_branch)
        levels.append(branch)
        branch = new_branch
    
    if item['l'] < current_level:
        current_level -= 1
        branch = levels.pop()
        branch['content'].append(item['v'])


root
