In [1]:
import re
import html

In [2]:
with open("post.txt", 'r') as f:
    post_text = f.read()
    post_text = html.unescape(post_text)

bb_uid = '17hlnlju'

In [3]:
block_tags = 'quote|code|list|\*'
start_pattern = f'\[((?:{block_tags}).*?)\:{bb_uid}\]'
end_pattern = f'\[\/({block_tags})(?:\:m|\:u|\:o)?\:{bb_uid}\]'

In [26]:
def split_post_into_block_tokens(post_text, start_pattern, end_pattern):
    token_groups = [
            [
                {"type" : "bbcode", "value" : e }
                if i%2 ==0
                else {"type" : "finish", "value" : e}
                for i, e in enumerate(
                    re.split(end_pattern, item)
                )
            ]
            if index%2 == 0
            else [{"type" : "start", "value" : item}]
            for index, item in enumerate(
                re.split(start_pattern, post_text)
            )
        ]
    tokens = []
    [tokens.extend(g) for g in token_groups]
    
    return tokens

In [53]:
re.search('^[ \n\t]*$', None)

TypeError: expected string or bytes-like object

In [61]:
def text_is_empty(txt):
    if txt == None:
        return True
    return re.search('^[ \n\t]*$', txt) != None

In [63]:
def clean_token_list(token_list):
    result = [
        item
        for item in token_list
        if (
            item['type'] != 'bbcode'
            or text_is_empty(item['value'])
        )
    ]
    return result

In [54]:
block_tokens = split_post_into_block_tokens(post_text, start_pattern, end_pattern)
clean_block_tokens =  clean_token_list(block_tokens)

In [121]:
t2 = t.copy()
t2.reverse()

root = {
    "type" : "root",
    "elements" : []
}
branch = root
breadcrumbs = []

pos = 0

while len(t2) > 0:
    item = t2.pop()

    if item['type'] == 'text':
        v = item['value']
        if re.search('^[\n\t ]+$', v) is None:
            branch['elements'].append({
                "type" : "text",
                "value" : v
            })
    
    if item['type'] == 'start':
        item_parts =  item['value'].split('=')
        item_parts.append('')
        leaf = {
            "type" : item_parts[0],
            "value" : item_parts[1],
            "elements" : []
        }
        branch['elements'].append(leaf)
        breadcrumbs.append(branch)
        branch = leaf

    if item['type'] == 'finish':
        branch = breadcrumbs.pop()

    pos += 1

root

{'type': 'root',
 'elements': [{'type': 'text',
   'value': '<!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n <!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n'},
  {'type': 'quote',
   'value': '"trekker"',
   'elements': [{'type': 'text', 'value': 'bold\n

In [51]:
re.search('^[ \n\t]*$', None) != None

TypeError: expected string or bytes-like object

In [122]:
def clean_up_text(txt):
    txt = re.sub(r' +\n', r'\n', txt)
    txt = re.sub(r'\n +', r'\n', txt)
    txt = re.sub(r'\n\n+', r'\n\n', txt)
    txt = re.sub(r'([^\n])\n([^\n])', r'\1\n  \2', txt)
    return txt
    


In [123]:
def render(element, item_prefix = '', parent_element = ''):

    e_type =element['type']

    if e_type == "text":
        t = element['value']
        t = clean_up_text(t)
    
    elif e_type == "b":
        t = ' **'
        for i in element['elements']:
            t += render(i, item_prefix = item_prefix, parent_element = e_type) 
        t += '** '

    elif e_type == "i":
        t = ' _'
        for i in element['elements']:
            t += render(i, item_prefix = item_prefix, parent_element = e_type)
        t += '_ '

    elif e_type == "u":
        t = ' <u>'
        for i in element['elements']:
            t += render(i, item_prefix = item_prefix, parent_element = e_type)
        t += '</u> '

    elif e_type == "url":

        t = '<http://broken.link>'
        inner_text_arr= [i for i in element['elements'] if i['type'] == 'text']
        inner_text = ''
        v = element['value']

        if len(inner_text_arr) > 0 :
                inner_text = inner_text_arr[0]['value']
                
        if v == '':
            # no explicit src -> the inner content needs to be a URL text
            if inner_text != '':
                t = '<' + inner_text + '>'
        else:
            t = f'[{inner_text}]({v})'

    elif e_type =='list':
        
        # before list starts
        t = ''
        if parent_element != 'list':
            t += '\n'

        # for numbered lists
        iterator = 0
        list_type = element['value']
    
        for i in element['elements']:

            if i['type'] == '*':
                
                bullet = '* '
                if list_type == 'a' or list_type == 'A' or list_type == '1':
                    bullet = str(iterator + 1) + '. '

                t +=  item_prefix + f'{bullet}' + render(i, item_prefix = item_prefix, parent_element = e_type)
                iterator += 1

            elif i['type'] == 'list':
                t += render(i, item_prefix = item_prefix + len(bullet) * ' ', parent_element = e_type)

            else:
                t += item_prefix + '  ' + render(i, item_prefix = item_prefix, parent_element = e_type)
        
        if parent_element != 'list':
            t += '\n'

    elif e_type == '*':
        t = ''
        for i in element['elements']:
            t = render(i, item_prefix = item_prefix, parent_element = e_type) + '\n'

    else:
        t = f'[{e_type}]'

    return t   
        

In [124]:
root['elements'][1]['elements'][12]

{'type': 'list',
 'value': '1',
 'elements': [{'type': '*',
   'value': '',
   'elements': [{'type': 'text', 'value': 'eee'}]},
  {'type': '*', 'value': '', 'elements': [{'type': 'text', 'value': 'fff'}]},
  {'type': 'list',
   'value': 'a',
   'elements': [{'type': '*',
     'value': '',
     'elements': [{'type': 'text', 'value': '111'}]},
    {'type': '*', 'value': '', 'elements': [{'type': 'text', 'value': '222'}]},
    {'type': '*',
     'value': '',
     'elements': [{'type': 'text', 'value': '333'}]}]},
  {'type': '*', 'value': '', 'elements': [{'type': 'text', 'value': 'x'}]},
  {'type': 'list',
   'value': 'a',
   'elements': [{'type': '*',
     'value': '',
     'elements': [{'type': 'text', 'value': '111'}]},
    {'type': 'list',
     'value': 'a',
     'elements': [{'type': '*',
       'value': '',
       'elements': [{'type': 'text', 'value': '444'}]},
      {'type': '*',
       'value': '',
       'elements': [{'type': 'text', 'value': '555'}]},
      {'type': '*',
      

In [125]:
md = ''
for i in root['elements'][1]['elements']: 
    md += render(i)

list : 
list : list
list : 
list : 
list : list
list : list
list : list


In [126]:
print(md)

bold

 **lorem** 

italic

 _lorem_ 

underscore

 <u>lorem</u> 

cytaty
[quote][quote]

kod

[code:17hlnlju]lorem[/code:17hlnlju]

listy


* aaa
* bbb
  * aaa
  * bbb
* aaa
* bbb
* aaa
* bbb


1. ccc
2. ddd


1. eee
2. fff
   1. 111
   2. 222
   3. 333
3. x
   1. 111
      1. 444
      2. 555
      3. 666
   2. 333
4. iii

[img]

[youtube:17hlnlju]iBxVRwynmmE[/youtube:17hlnlju]

<http://www.wp.pl>[wu pe pe el](http://www.wp.pl)

[s:17hlnlju]strike[/s:17hlnlju]

X[super:17hlnlju]1[/super:17hlnlju]

Y[sub:17hlnlju]2[/sub:17hlnlju]

[tex:17hlnlju]asad[/tex:17hlnlju]


In [127]:
start_pattern = f'\[quote(?:=\&quot\;([^\&]+)\&quot\;)?:{bb_uid}\]'
end_pattern = f'\[(\/)quote:{bb_uid}\]'
tag_pattern =  start_pattern + '|' + end_pattern

In [128]:
quote_parts = re.split(tag_pattern, post_text)

In [129]:
quote_parts.reverse()
tree = []
level = 0
tree.append({
    "level" : level,
    "author" : '',
    "text" : quote_parts.pop()
})
while len(quote_parts) > 0:
    
    tag_start = quote_parts.pop()
    tag_end = quote_parts.pop()
    tag_txt = quote_parts.pop()
    
    auth = ''

    if tag_end == '/':
        level -= 1    
    else:
        level += 1
        if tag_start is not None:
            auth = tag_start
    
    tree.append({
        "level" : level,
        "author" : auth,
        "text" : tag_txt
    })
    

tree

[{'level': 0,
  'author': '',
  'text': '<!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n <!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n[quote="trekker":17hlnlju]bold\n\n[b:17hlnlju]lorem[/b:17hlnlju]\n\nitalic\n\n[i:17hlnlju]lorem[/i:17hlnlju]\n\nunders

In [130]:
tree

[{'level': 0,
  'author': '',
  'text': '<!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n <!-- s:fight: --><img src="{SMILIES_PATH}/bije.gif" alt=":fight:" title="fight" /><!-- s:fight: -->  <!-- s:hurra: --><img src="{SMILIES_PATH}/hura.gif" alt=":hurra:" title="Hurra!" /><!-- s:hurra: -->  <!-- s:olaboga: --><img src="{SMILIES_PATH}/olaboga.gif" alt=":olaboga:" title="Olaboga" /><!-- s:olaboga: -->  <!-- s:wnerw: --><img src="{SMILIES_PATH}/wnerw.gif" alt=":wnerw:" title="wnerw" /><!-- s:wnerw: --> \n\n[quote="trekker":17hlnlju]bold\n\n[b:17hlnlju]lorem[/b:17hlnlju]\n\nitalic\n\n[i:17hlnlju]lorem[/i:17hlnlju]\n\nunders

In [131]:
lines = []
for item in tree:
    a = item['author']
    l = item['level']
    t = item['text']
    if a != '':
        lines.append((l - 1) * '> ' + f'({a} napisał/a:)')
    for line in t.split('\n'):
        lines.append(l * '>' + ' ' + line)

In [132]:
'aaa'.replace(old, new)

NameError: name 'old' is not defined

x<em>2</em>

In [None]:
lines = [l.replace(f'[b:{bb_uid}]', ' **') for l in lines]
lines = [l.replace(f'[/b:{bb_uid}]', '** ') for l in lines]
lines = [l.replace(f'[u:{bb_uid}]', ' <u>') for l in lines]
lines = [l.replace(f'[/u:{bb_uid}]', '</u> ') for l in lines]
lines = [l.replace(f'[i:{bb_uid}]', ' _') for l in lines]
lines = [l.replace(f'[/i:{bb_uid}]', '_ ') for l in lines]
for l in lines:
    print(l)

In [None]:
items_list = [
    {"l" : 0, "v" : 'a', "type" : 'text'},
    {"l" : 1, "v" : 'b', "type" : 'b'},
    {"l" : 1, "v" : 'c', "type" : 'quote'},
    {"l" : 2, "v" : 'd', "type" : 'quote'},
    {"l" : 1, "v" : 'e', "type" : 'text'},
    {"l" : 2, "v" : 'f'},
    {"l" : 1, "v" : 'g'},
    {"l" : 0, "v" : 'h'},
    {"l" : 1, "v" : 'i'},
    {"l" : 0, "v" : 'j'}
]
items_list.reverse()

In [None]:
root = {
    "type" : 'text',
    "content" : []
}
branch = root
levels = []
current_level = 0

while len(items_list) > 0:
    item = items_list.pop()
    
    if item['l'] == current_level:
        branch['content'].append(
            {
                "type" : 'text',
                "content" : [item['v']]
            }
        )

    if item['l'] > current_level:
        current_level += 1
        
        new_branch = {
            "type" : "text",
            "content" : []
        }
        new_branch['content'].append(item['v'])
        
        branch['content'].append(new_branch)
        levels.append(branch)
        branch = new_branch
    
    if item['l'] < current_level:
        current_level -= 1
        branch = levels.pop()
        branch['content'].append(item['v'])


root
