In [3]:
import html
import os
import re
import shutil
import urllib
import urllib.parse

import pandas as pd
import yaml
from IPython.display import display, clear_output, display_html
from sqlalchemy import create_engine

In [4]:
def _read_yaml_file(path):
    """Return the object parsed from the YAML file at ``path``."""
    with open(path, 'r') as yaml_file:
        # NOTE(review): FullLoader can construct more than plain data; consider
        # yaml.safe_load if these files can ever come from outside the project.
        return yaml.load(yaml_file.read(), Loader=yaml.FullLoader)


# Lookup consulted by translate_url (queried by integer post id).
post_pagination = _read_yaml_file('post_links.yml')

# Lookup consulted by map_smiely_code (indexed by smiley code).
smiley_code_map = _read_yaml_file('smiley_code_map.yml')


In [5]:
def replace_simple_tags(txt, bb_uid):
    """Swap BBCode tags that map one-to-one onto an HTML element.

    Args:
        txt: post text containing tags of the form ``[name:uid]``.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with the opening tags replaced by ``<ELEMENT>`` and the
        closing tags (bare or qualified with ``m``/``u``/``o``) replaced by
        ``</ELEMENT>``. Tags carrying a different uid are left alone.
    """
    html_names = {
        'u': 'U',
        'b': 'STRONG',
        'i': 'EM',
        's': 'STRIKE',
        'sup': 'SUP',
        'super': 'SUP',
        'sub': 'SUB',
        'code': 'PRE',
        '*': 'LI',
    }
    # A closing tag is either bare or has one of these qualifiers before the uid.
    closing_qualifiers = ('', 'm:', 'u:', 'o:')

    for bb_name, html_name in html_names.items():
        txt = txt.replace(f'[{bb_name}:{bb_uid}]', f'<{html_name}>')
        html_close = f'</{html_name}>'
        for qualifier in closing_qualifiers:
            txt = txt.replace(f'[/{bb_name}:{qualifier}{bb_uid}]', html_close)

    return txt


In [6]:
def replace_linebreaks(txt, bb_uid):
    """Turn line breaks into ``<BR />`` except directly after block-level tags.

    Args:
        txt: post text still containing BBCode tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` where a single newline becomes ``<BR />`` + newline, a run of
        two or more newlines becomes exactly two ``<BR />`` lines, and the
        newline that directly follows an opening or closing
        list/quote/code/``*`` tag is kept as a plain newline.
    """
    uid = re.escape(bb_uid)
    placeholder = f'[br:{bb_uid}]'

    def keep_tag_add_marker(match):
        # Swap the newline after the tag for a marker the BR rules below ignore.
        return match.group(1) + placeholder

    # First, protect the line break after selected end tags ...
    txt = re.sub(
        rf'(\[/(list|quote|code|\*)(:(m|o|u))?:{uid}\])\n',
        keep_tag_add_marker,
        txt
    )
    # ... and after selected start tags.
    txt = re.sub(
        rf'(\[(list|quote|code|\*)(=[^:]+)?:{uid}\])\n',
        keep_tag_add_marker,
        txt
    )
    # Then change all remaining line breaks to BRs.
    txt = re.sub(r'(?<!\n)\n(?!\n)', '<BR />\n', txt)
    txt = re.sub(r'(?<!\n)\n\n+(?!\n)', '<BR />\n<BR />\n', txt)
    # Finally restore the protected line breaks for readability of the output.
    return txt.replace(placeholder, '\n')


In [7]:
def replace_quotes(txt, bb_uid):
    """Convert ``[quote]`` BBCode into ``<BLOCKQUOTE><P>`` markup.

    Args:
        txt: post text containing quote tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with attributed quotes (``[quote=&quot;name&quot;:uid]``)
        opened with a "»name« pisze:" header, anonymous quotes opened with a
        "cytat:" header, and every closing quote tag replaced by
        ``</P></BLOCKQUOTE>``.
    """
    uid = re.escape(bb_uid)

    # Attributed quote. The author name is matched up to the next '&', so a
    # name that itself contains an HTML entity is not recognised here.
    txt = re.sub(
        rf'\[quote=(?:&quot;)([^&]+)(?:&quot;):{uid}\]',
        r'<BLOCKQUOTE><P>\n»\1« pisze:<BR />\n',
        txt
    )

    # Anonymous quote.
    txt = re.sub(
        rf'\[quote:{uid}\]',
        r'<BLOCKQUOTE><P>\ncytat:<BR />\n',
        txt
    )

    # Closing tag, with or without an o/u/m qualifier.
    txt = re.sub(
        rf'\[/quote(:(o|u|m))?:{uid}\]',
        '\n</P></BLOCKQUOTE>',
        txt
    )

    return txt
    

In [8]:
def replace_lists(txt, bb_uid):
    """Convert ``[list]`` BBCode containers into ``<OL>`` / ``<UL>`` elements.

    Args:
        txt: post text containing list tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with ``[list=X:uid]`` replaced by ``<OL type="X">`` (``X`` is
        a single character), ``[list:uid]`` by ``<UL>``, and the ``:o:`` /
        ``:u:`` closing tags by ``</OL>`` / ``</UL>``. List items (``[*]``)
        are not handled here.
    """
    uid = re.escape(bb_uid)
    substitutions = (
        (rf'\[list=(.):{uid}\]', r'<OL type="\1">'),
        (rf'\[list:{uid}\]', r'<UL>'),
        (rf'\[/list:o:{uid}\]', r'</OL>'),
        (rf'\[/list:u:{uid}\]', r'</UL>'),
    )
    for pattern, replacement in substitutions:
        txt = re.sub(pattern, replacement, txt)

    return txt


In [9]:
def translate_url(url):
    """Rewrite a link into the old forum as a link into the generated site.

    Args:
        url: an already HTML-unescaped URL.

    Returns:
        * ``url`` unchanged when it does not contain one of the known forum
          hosts, is not a ``viewtopic.php`` / ``viewforum.php`` link, or
          carries none of the ``p`` / ``t`` / ``f`` parameters;
        * the thread page anchor for a post id found in ``post_pagination``;
        * ``/missing_post.html?link=<quoted old url>`` for a post id that is
          not in ``post_pagination`` (or is not numeric);
        * the thread index for a ``t`` parameter, the forum index for ``f``.
    """
    known_hosts = (
        'http://pwgay.org/forum/',
        'http://pwgay.7z9.net/forum/',
        'http://pwgy.vipserv.org/',
    )
    # Plain substring test: the hosts are literal text, not regex patterns.
    if not any(host in url for host in known_hosts):
        return url

    param_pattern = r'[a-zA-Z]+=[0-9a-zA-Z]+'
    params = re.search(
        rf'(?:viewtopic|viewforum)\.php\?((?:{param_pattern})(?:&{param_pattern})*)',
        url
    )
    if params is None:
        return url

    param_dict = dict(pair.split('=', 1) for pair in params.group(1).split('&'))
    post_id = param_dict.get('p')
    topic_id = param_dict.get('t')
    forum_id = param_dict.get('f')

    if post_id is not None:
        # The parameter regex also admits letters; only a numeric id can be looked up.
        link_params = post_pagination.get(int(post_id)) if post_id.isdigit() else None

        if link_params is None:
            encoded_old_url = urllib.parse.quote(url)
            return f'/missing_post.html?link={encoded_old_url}'

        thread_num = link_params['t']
        page_num = link_params['p']
        if page_num == 1:
            return f'/thread/{thread_num}/index.html#post_{post_id}'
        return f'/thread/{thread_num}/page_{page_num}.html#post_{post_id}'

    if topic_id is not None:
        # NOTE(review): post links use '/thread/' while this uses '/threads/' -
        # confirm which prefix the generated site really serves.
        return f'/threads/{topic_id}/index.html'

    if forum_id is not None:
        return f'/forums/{forum_id}/index.html'

    return url


In [10]:
def url_tag(url, make_tag='none'):
    """Unescape and translate a stored URL, optionally wrapping it in an ``<A>`` tag.

    Args:
        url: URL as stored in the post text (HTML-escaped).
        make_tag: ``'full'`` returns a complete link whose label is the URL,
            ``'start'`` returns only the opening ``<A href="...">`` tag, any
            other value returns the bare translated URL.

    Returns:
        The translated URL or the requested markup. Inside markup the URL is
        HTML-escaped again so that ``&``, quotes and angle brackets cannot
        break the attribute or the surrounding document.
    """
    url = translate_url(html.unescape(url))

    if make_tag == 'full':
        safe_url = html.escape(url, quote=True)
        return f'<A href="{safe_url}">{safe_url}</A>'
    if make_tag == 'start':
        safe_url = html.escape(url, quote=True)
        return f'<A href="{safe_url}">'
    return url

In [11]:
def replace_url(txt, bb_uid):
    """Convert ``[url]`` BBCode into ``<A>`` links.

    Args:
        txt: post text containing url tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` where ``[url:uid]target[/url:uid]`` is replaced by the full
        link produced by ``url_tag``, ``[url=target:uid]`` by the opening tag
        produced by ``url_tag``, and any remaining ``[/url:uid]`` by ``</A>``.
    """
    uid = re.escape(bb_uid)

    # Simple URL: the target doubles as the link label.
    txt = re.sub(
        rf'\[url:{uid}\]([^\[]+)\[/url:{uid}\]',
        lambda match: url_tag(match.group(1), make_tag='full'),
        txt
    )

    # Complex URL: the target sits in the opening tag, the label follows it.
    # The target must not contain a literal ':' for this pattern to match.
    txt = re.sub(
        rf'\[url=([^:]+):{uid}\]',
        lambda match: url_tag(match.group(1), make_tag='start'),
        txt
    )
    txt = re.sub(rf'\[/url:{uid}\]', '</A>', txt)

    return txt

In [12]:
def replace_size(txt, bb_uid):
    """Convert ``[size=N]`` BBCode into a ``<SPAN>`` with a percentage font size.

    Args:
        txt: post text containing size tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with ``[size=N:uid]`` replaced by
        ``<SPAN style="font-size:N%">`` and ``[/size:uid]`` by ``</SPAN>``.
    """
    uid = re.escape(bb_uid)
    txt = re.sub(
        rf'\[size=([0-9]+):{uid}\]',
        r'<SPAN style="font-size:\1%">',
        txt
    )
    txt = re.sub(rf'\[/size:{uid}\]', r'</SPAN>', txt)
    return txt
    

In [13]:
def repalce_color(txt, bb_uid):
    """Convert ``[color=...]`` BBCode into a ``<SPAN>`` with an inline colour.

    The misspelled function name is kept because callers use it.

    Args:
        txt: post text containing color tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with ``[color=VALUE:uid]`` replaced by
        ``<SPAN style="color:VALUE">`` (``VALUE`` is made of ``#``, letters
        and digits) and ``[/color:uid]`` by ``</SPAN>``.
    """
    uid = re.escape(bb_uid)
    txt = re.sub(
        rf'\[color=([#a-zA-Z0-9]+):{uid}\]',
        r'<SPAN style="color:\1">',
        txt
    )
    txt = re.sub(rf'\[/color:{uid}\]', r'</SPAN>', txt)
    return txt

In [14]:
def img_tag(url):
    """Build the HTML block for an embedded image plus an Internet Archive link.

    Args:
        url: image URL as stored in the post text (HTML-escaped).

    Returns:
        A ``<P class="image_holder">`` block with the image itself and a link
        that searches web.archive.org for the image URL. The link label is
        the URL, shortened to its first and last 20 characters when it is
        longer than 50. Every place the URL is inserted is HTML-escaped.
    """
    url = html.unescape(url)
    # Escape for the src attribute as well, like the href and the label below.
    src_safe = html.escape(url)
    search_url_safe = html.escape(f'https://web.archive.org/web/20240000000000*/{url}')

    url_display = url if len(url) <= 50 else f'{url[:20]} ... {url[-20:]}'
    url_display = html.escape(url_display)

    tag = f'''<P class="image_holder">
    <IMG src="{src_safe}" /><br />
    <A href="{search_url_safe}" title="poszukaj w internet archive">{url_display}</A>
    </P>'''
    return tag


In [15]:
def replace_img(txt, bb_uid):
    """Convert ``[img]`` BBCode into the markup produced by ``img_tag``.

    Args:
        txt: post text containing img tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with every ``[img:uid]target[/img:uid]`` replaced by
        ``img_tag(target)``; everything else is left untouched.
    """
    uid = re.escape(bb_uid)
    return re.sub(
        rf'\[img:{uid}\]([^\[]+)\[/img:{uid}\]',
        lambda match: img_tag(match.group(1)),
        txt
    )

In [16]:
def replace_attachment(txt, post_id, bb_uid):
    """Convert ``[attachment=N]`` BBCode into an include of ``inline_attachment.html``.

    Args:
        txt: post text containing attachment tags.
        post_id: id of the post, written into the include parameters.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with ``[attachment=N:uid]`` replaced by an ``<A ... />`` tag
        whose attributes come from
        ``{% include inline_attachment.html post_id=... attachment_index=N %}``
        and ``[/attachment:uid]`` replaced by ``</A>``.
    """
    uid = re.escape(bb_uid)

    def include_tag(match):
        # Built in a function so nothing in the text is read as a regex
        # replacement escape.
        return (
            f'<A {{% include inline_attachment.html post_id={post_id} '
            f'attachment_index={match.group(1)} %}} />'
        )

    txt = re.sub(rf'\[attachment=([0-9]+):{uid}\]', include_tag, txt)
    txt = re.sub(rf'\[/attachment:{uid}\]', '</A>', txt)
    return txt

In [17]:
def map_smiely_code(code):
    """Return the ``smiley.html`` include for a smiley code.

    The id is looked up in ``smiley_code_map``; an unknown code raises
    ``KeyError``.
    """
    return '{{% include smiley.html smiely_id={} %}}'.format(smiley_code_map[code])

In [55]:
def replace_smilies(txt):
    """Replace stored smiley markup with the include from ``map_smiely_code``.

    A smiley is recognised as an ``<img src="{SMILIES_PATH}/...">`` element
    (optionally with ``alt`` and ``title``) wrapped in a pair of
    ``<!-- sCODE -->`` comments; ``CODE`` from the first comment is what gets
    passed to ``map_smiely_code``.

    Args:
        txt: post text possibly containing smiley markup.

    Returns:
        ``txt`` with every smiley replaced; other text is left untouched.
    """
    pattern = r'\<\!\-\- s([^\ ]+) \-\-\>\<img +src\=\"\{SMILIES\_PATH\}\/[^\"]+\"(?: +alt\=\"[^\"]+\")?(?: +title\=\"[^"]+\")? \/\>\<\!\-\- s[^ ]+ \-\-\>'
    return re.sub(pattern, lambda match: map_smiely_code(match.group(1)), txt)

In [19]:
def tex_tag(txt):
    """Build the HTML block that renders a TeX snippet through latex.codecogs.com.

    Args:
        txt: TeX source as stored in the post text (HTML-escaped).

    Returns:
        A ``<DIV class="tex">`` block with an image whose URL carries the
        percent-quoted TeX, and the TeX source shown as ``alt``/``title`` text
        and inside ``<CODE>``. The displayed source has newlines removed and
        runs of spaces collapsed, and is HTML-escaped so that characters such
        as ``<``, ``&`` or ``"`` cannot break the markup.
    """
    tex = html.unescape(txt)
    # The image URL is built from the source before whitespace is normalised.
    url = urllib.parse.quote(tex)

    tex = re.sub(' +', ' ', tex.replace('\n', ''))
    tex_safe = html.escape(tex)

    return (
        '<DIV class="tex">'
        f'<IMG src="https://latex.codecogs.com/gif.latex?{url}" alt="{tex_safe}" title="{tex_safe}"/>'
        f'<div><CODE>{tex_safe}</CODE></div>'
        '</div>'
    )

In [20]:
def replace_tex(txt, bb_uid):
    """Replace ``[tex]...[/tex]`` sections with the markup from ``tex_tag``.

    The text is cut at every opening or closing tex tag (together with the
    newlines that follow the tag). The resulting pieces are taken to
    alternate between ordinary text and TeX source, so tags are expected to
    come in properly ordered pairs.

    Args:
        txt: post text containing tex tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with each TeX piece replaced by ``tex_tag(piece)``.
    """
    uid = re.escape(bb_uid)
    pieces = re.split(rf'(\[tex:{uid}\]|\[/tex:{uid}\])\n*', txt)

    # Odd indices hold the captured tags themselves; drop them.
    segments = pieces[::2]
    converted = [
        segment if position % 2 == 0 else tex_tag(segment)
        for position, segment in enumerate(segments)
    ]
    return ''.join(converted)

In [21]:
def yt_tag(txt):
    """Build the HTML block embedding a YouTube video.

    Args:
        txt: the video identifier, inserted as-is into the embed and watch URLs.

    Returns:
        A ``<DIV class="youtube_embed">`` block holding the embed ``<iframe>``
        and a plain link to the video's watch page.
    """
    return (
        '<DIV class="youtube_embed">'
        f'<iframe  width="560" height="315" src="https://www.youtube.com/embed/{txt}"></iframe>'
        f'<div><A href="https://www.youtube.com/watch?v={txt}" title="zobacz na YouTube">'
        f'https://www.youtube.com/watch?v={txt}'
        '</A></div></DIV>'
    )


In [22]:
def replace_youtube(txt, bb_uid):
    """Replace ``[youtube]...[/youtube]`` sections with the markup from ``yt_tag``.

    The text is cut at every opening or closing youtube tag (together with
    the newlines that follow the tag). The resulting pieces are taken to
    alternate between ordinary text and video identifiers, so tags are
    expected to come in properly ordered pairs.

    Args:
        txt: post text containing youtube tags.
        bb_uid: the uid suffix carried by every tag of this post.

    Returns:
        ``txt`` with each identifier piece replaced by ``yt_tag(piece)``.
    """
    uid = re.escape(bb_uid)
    pieces = re.split(rf'(\[youtube:{uid}\]|\[/youtube:{uid}\])\n*', txt)

    # Odd indices hold the captured tags themselves; drop them.
    segments = pieces[::2]
    converted = [
        segment if position % 2 == 0 else yt_tag(segment)
        for position, segment in enumerate(segments)
    ]
    return ''.join(converted)


In [23]:
def parese_post_bb_code(text, post_id, bb_uid):
    """Run the whole BBCode-to-HTML pipeline over one post.

    Args:
        text: raw post text.
        post_id: id of the post (only the attachment step uses it).
        bb_uid: the uid suffix carried by every BBCode tag of this post.

    Returns:
        The converted text. The steps run in a fixed order; TeX sections are
        converted first and YouTube sections last.
    """
    # Steps sharing the (text, bb_uid) signature, in the order they must run.
    uid_steps = (
        replace_tex,
        replace_linebreaks,
        replace_simple_tags,
        replace_quotes,
        replace_lists,
        replace_url,
        replace_size,
        repalce_color,
        replace_img,
    )

    converted = text
    for step in uid_steps:
        converted = step(converted, bb_uid)

    converted = replace_attachment(converted, post_id, bb_uid)
    converted = replace_smilies(converted)
    return replace_youtube(converted, bb_uid)
    

In [None]:
def print_progress(progress_log):
    """Persist the progress log and show the most recently written file path.

    The log is dumped as YAML to ``post_processing_log.yml`` (overwriting it),
    then the cell output is cleared and ``progress_log['last_file_path']`` is
    displayed in its place.
    """
    with open('post_processing_log.yml', 'w') as log_file:
        log_file.write(yaml.dump(progress_log))

    # wait=True defers the clearing until the new output arrives.
    clear_output(wait=True)
    display(progress_log['last_file_path'])

In [None]:
def process_post(post_content, progress_log):
    """Convert one post and write it as an HTML file with YAML front matter.

    Args:
        post_content: mapping-like record of one post (the main loop passes a
            row from ``DataFrame.iterrows()``). It must provide ``post_text``,
            ``bbcode_uid``, ``post_id``, ``forum_id``, ``topic_id`` and
            ``post_timestamp``.
        progress_log: progress dictionary; ``last_post`` and
            ``last_file_path`` are updated and the log is persisted through
            ``print_progress``.

    Returns:
        ``post_content``, unchanged.
    """
    # Everything except the body and the BBCode uid goes into the front matter.
    post_meta = {k: v for k, v in post_content.items() if k not in ['post_text', 'bbcode_uid']}
    post_text = post_content['post_text']
    post_uid = post_content['bbcode_uid']
    post_id = post_meta['post_id']
    post_forum = post_meta['forum_id']
    post_thread = post_meta['topic_id']
    post_meta['category'] = post_meta['topic_id']

    # presumably a pandas Timestamp; converted to a plain datetime for the YAML dump
    post_meta['post_timestamp'] = post_meta['post_timestamp'].to_pydatetime()

    post_text_processed = parese_post_bb_code(post_text, post_id, post_uid)

    post_dir_path = os.path.join('..', '_tests', '_forum_posts', f'forum_{post_forum}', f'thread_{post_thread}')
    os.makedirs(post_dir_path, exist_ok=True)
    post_file_path = os.path.join(post_dir_path, f'post_{post_id}.html')

    # The converted text contains non-ASCII characters (e.g. the quote header),
    # so the encoding must not depend on the platform default.
    with open(post_file_path, 'w', encoding='utf-8') as f:
        f.write('---\n')
        f.write(yaml.dump(post_meta, default_style='"'))
        f.write('---\n')
        f.write(post_text_processed)

    progress_log['last_post'] = post_id
    progress_log['last_file_path'] = post_file_path
    print_progress(progress_log)

    return post_content

In [62]:
# Progress log, the same file print_progress writes; it provides the
# last_forum / last_thread / last_post values used to build the thread query.
with open('post_processing_log.yml', 'r') as f:
    progress = yaml.load(f.read(), Loader=yaml.FullLoader)

# Forum ids to process, with id 0 put in front of the ids read from the file.
with open('forum_checklist.yml', 'r') as f:
    forum_list = [0, *yaml.load(f.read(), Loader=yaml.FullLoader)]

[0, 2, 11, 16, 22, 19, 40, 3, 20, 41, 15, 21, 12, 29, 30, 31]

In [None]:
# Connection settings; only the database name is taken from the environment.
db = os.getenv('PSQL_DATABASE')
host, user, port = 'localhost', 'user', '5432'

# Connection URL (a plain string with an empty password) handed to pd.read_sql.
psql_engine = f'postgresql://{user}:@{host}:{port}/{db}'

In [None]:
# SQL templates; placeholders are filled in with str.format.
with open('get_all_threads.sql', 'r') as f:
    all_threads_template = f.read()

with open('get_threads_posts.sql', 'r') as f:
    threads_posts_template = f.read()

# Thread query limited to the checklist forums and parameterised with the
# position stored in the progress log.
all_threads_query = all_threads_template.format(
    forums=', '.join(map(str, forum_list)),
    from_forum=progress['last_forum'],
    from_thread=progress['last_thread'],
    from_post=progress['last_post'],
)

# One dict per result row.
all_topics = pd.read_sql(all_threads_query, psql_engine).to_dict(orient='records')

In [None]:
for topic in all_topics:

    # Record the thread as the current position before any of its posts are written.
    progress.update({
        'last_forum': topic['forum_id'],
        'last_thread': topic['topic_id'],
        'last_post': -1,
        'last_file_path': '',
    })
    print_progress(progress)

    topic_posts_query = threads_posts_template.format(thread_id=topic['topic_id'])
    topic_posts = pd.read_sql(topic_posts_query, psql_engine)

    # iterrows yields (index, row) pairs; only the row is passed on.
    for post in topic_posts.iterrows():
        process_post(post[1], progress)
