In [6]:
from tqdm import tqdm_notebook as tqdm
import json
import pickle
from pathlib import Path
import itertools

In [7]:
def get_id_for_comments(thing):
    if thing["type"] == "submission":
        return "t3_" + thing["id"]
    else:
        return "t1_" + thing["id"]

    


In [8]:
def format_reddit_thing(thing, submission_id):
    """Format a dict of comment or submisson data."""

    if thing["type"] == "submission":
        text = "\n".join([thing["title"], thing.get("selftext", "")])
    else:
        text = thing["body"]
    text = html.unescape(text)
    return text

In [71]:
def format_submission(thing):
    return f"""****S
{thing.get('url','')}
{thing['title']}
{thing.get('selftext', '')}
****ES {normalize_id(thing['id'])}

"""


def format_top_level_comment(thing):
    return f"""****T {normalize_id(thing['parent_id'])}
{thing['body']}
****ET {thing['id']}

"""


def format_reply(thing):
    return f"""****R {normalize_id(thing['parent_id'])}
{thing['body']}
****ER {thing['id']}

"""

In [134]:
import collections
def normalize_id(iid):
    return iid.split('_')[-1]

def format_thing(thing, submission_id):
    submission_id = submission_id.split('_')[-1]
    if thing['id']==normalize_id(submission_id):
        return format_submission(thing)
    elif normalize_id(thing['parent_id'])==normalize_id(submission_id):
        return format_top_level_comment(thing)
    else:
        return format_reply(thing)
    
def comments_to_queue(comment_dict, submission):
    queue = [submission]
    num_inputs = len(list(itertools.chain(*comment_dict.values())))
    while len(list(itertools.chain(*comment_dict.values()))) > 0:
        for queue_position in range(len(queue) - 1, -1, -1):
#             print('queue_position', queue_position)
            current_id = get_id_for_comments(queue[queue_position])
            found = comment_dict[current_id]
            if len(found):
                break
        next_comment = comment_dict[current_id].pop()
        queue.append(next_comment)
    assert len(queue)==num_inputs+1
    
#     # Should be no dups
#     ids = [x['id'] for x in queue]
#     count = collections.Counter(ids)
#     assert len(ids)==len(set(ids)), f'should be unique but {count.most_common()}'
    return queue

In [135]:
# data = pickle.load(pkl_file.open('rb'))    
# comment_dict = data['comment_dict']
# submission = data['submission']
# ids = [x['id'] for x in itertools.chain(*comment_dict.values())]
# count = collections.Counter(ids)
# count

In [136]:
data = pickle.load(pkl_file.open('rb'))    
comment_dict = data['comment_dict']
submission = data['submission']
queue = comments_to_queue(comment_dict, submission)

submission_id = queue[0]['id']
text = ''
for thing in queue:
    text += format_thing(thing, submission_id)
print(text)

****S
https://www.reddit.com/r/shittyaskscience/comments/835qul/i_sent_a_nigerian_prince_100_cause_he_said_hell/
I sent a Nigerian Prince $100 cause he said he'll then send $100,000 back. What are the finance behind it and when should I expect my money back?
[removed]
****ES 835qul

****T 835qul
Simple: exchange rates. you send him $100USD, which comes out to 36231NGN. This in turn is worth 10742JPY, which will be turned into $129.58CND. See where this is going? Already we have $30 more than we started with! 

This goes on and on, through all sorts of currencies -- many of which we rarely even hear of! Eventually, though, you will receive that 1000x return.
****ET dvfbynh

****R dvfbynh
I actually googled it and you'd end up with roughly 70 cents more if at the end you converted to USD instead of CAD
****ER dvfiq41

****R dvfiq41
But what if you do that a million times?
****ER dvfyojk

****R dvfbynh
In economics this is a term called arbitrary
****ER dvfgx61

****R dvfbynh
So does that

In [137]:
import random
from sklearn.model_selection import train_test_split
data_dir = Path('./data/reddit_threads/')
subreddits = [sub.name for sub in data_dir.glob('*/')]
for sub in subreddits:
    pkl_files = sorted(data_dir.glob(sub+'/*.pickle'))
    if len(pkl_files)>200:
        # split
        train_files, test_files = train_test_split(
            pkl_files, test_size=0.1, random_state=42
        )
        train_files, valid_files = train_test_split(
            train_files, test_size=0.1, random_state=42
        )
        splits = dict(train=train_files, valid=valid_files, test=test_files)

        for split, files in splits.items():
            text = ''
            for pkl_file in tqdm(files):
                data = pickle.load(pkl_file.open('rb'))    
                comment_dict = data['comment_dict']
                submission = data['submission']

                # Sort comments by their parent id
                try:
                    queue = comments_to_queue(comment_dict, submission)
                except IndexError as e:
                    print(e)
                    continue
                    

                # Format
                submission_id = queue[0]['id']
                for thing in queue:
                    text += format_thing(thing, submission_id)
            out_file = data_dir.joinpath(f'{sub}_{split}.txt')
            out_file.open('w').write(text)
            print(out_file)

HBox(children=(IntProgress(value=0, max=918), HTML(value='')))

data/reddit_threads/shittyaskscience_train.txt


HBox(children=(IntProgress(value=0, max=102), HTML(value='')))

data/reddit_threads/shittyaskscience_valid.txt


HBox(children=(IntProgress(value=0, max=114), HTML(value='')))

data/reddit_threads/shittyaskscience_test.txt


HBox(children=(IntProgress(value=0, max=956), HTML(value='')))

data/reddit_threads/Scotland_train.txt


HBox(children=(IntProgress(value=0, max=107), HTML(value='')))

data/reddit_threads/Scotland_valid.txt


HBox(children=(IntProgress(value=0, max=119), HTML(value='')))

data/reddit_threads/Scotland_test.txt


HBox(children=(IntProgress(value=0, max=1054), HTML(value='')))

data/reddit_threads/copypasta_train.txt


HBox(children=(IntProgress(value=0, max=118), HTML(value='')))

data/reddit_threads/copypasta_valid.txt


HBox(children=(IntProgress(value=0, max=131), HTML(value='')))

data/reddit_threads/copypasta_test.txt


HBox(children=(IntProgress(value=0, max=1103), HTML(value='')))

data/reddit_threads/machinelearning_train.txt


HBox(children=(IntProgress(value=0, max=123), HTML(value='')))

data/reddit_threads/machinelearning_valid.txt


HBox(children=(IntProgress(value=0, max=137), HTML(value='')))

data/reddit_threads/machinelearning_test.txt


HBox(children=(IntProgress(value=0, max=410), HTML(value='')))

data/reddit_threads/dreams_train.txt


HBox(children=(IntProgress(value=0, max=46), HTML(value='')))

data/reddit_threads/dreams_valid.txt


HBox(children=(IntProgress(value=0, max=51), HTML(value='')))

data/reddit_threads/dreams_test.txt


HBox(children=(IntProgress(value=0, max=1475), HTML(value='')))

data/reddit_threads/singularity_train.txt


HBox(children=(IntProgress(value=0, max=164), HTML(value='')))

data/reddit_threads/singularity_valid.txt


HBox(children=(IntProgress(value=0, max=183), HTML(value='')))

data/reddit_threads/singularity_test.txt


HBox(children=(IntProgress(value=0, max=1206), HTML(value='')))

data/reddit_threads/programmingcirclejerk_train.txt


HBox(children=(IntProgress(value=0, max=134), HTML(value='')))

data/reddit_threads/programmingcirclejerk_valid.txt


HBox(children=(IntProgress(value=0, max=149), HTML(value='')))

data/reddit_threads/programmingcirclejerk_test.txt


HBox(children=(IntProgress(value=0, max=1194), HTML(value='')))

pop from empty list
pop from empty list


KeyboardInterrupt: 

In [139]:
# Make one that is all
for split in splits.keys():
    inputs = data_dir.glob(f'*_{split}.txt')
    out_file = data_dir.joinpath(f'{split}.txt')
    text = ''
    for inp in inputs:
        text += inp.open('r').read()
    out_file.open('w').write(text)
    print(out_file)

data/reddit_threads/train.txt
data/reddit_threads/valid.txt
data/reddit_threads/test.txt
