In [1]:
import os
import json
from glob import glob
import numpy as np
from transformers import GPT2Tokenizer

In [2]:
# Load the pretrained "gpt2" tokenizer so its end-of-sequence token can be reused below.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# EOS token of the tokenizer; write_to_text emits it on its own line after every
# formatted comment.
eos = tokenizer.eos_token

In [3]:
# Glob pattern selecting the JSON files (one is loaded per comment in write_to_text)
# that are split into test/valid/train sets below.
inputpath = 'data/user/suncoasthost/*.json'

In [4]:
# glob() yields matches in arbitrary, filesystem-dependent order. Sorting gives a
# stable file list, so anything derived from positions in it does not vary
# between machines or runs.
fnames = sorted(glob(inputpath))

In [5]:
# Fraction of the files held out for the validation split (the test split is
# carved out with the same size further down).
valid_prop = .1
# Fixed seed for a dedicated generator: re-running the notebook reproduces the
# same permutation instead of depending on NumPy's global RNG state.
split_seed = 0
# Random permutation of 0..len(fnames)-1, used to shuffle the file list.
shuffled_indices = list(np.random.default_rng(split_seed).permutation(len(fnames)))
# Never let a held-out split be empty, even when there are fewer than 1/valid_prop files.
valid_size = max(1, int(valid_prop*len(fnames)))

In [6]:
# Reorder the file list by looking up each shuffled index in turn.
fnames_shuffled = list(map(fnames.__getitem__, shuffled_indices))

In [7]:
# Cut the shuffled list into three disjoint pieces: the first valid_size files
# form the test set, the next valid_size the validation set, the rest is training.
fnames_test, fnames_valid, fnames_train = (
    fnames_shuffled[:valid_size],
    fnames_shuffled[valid_size:2*valid_size],
    fnames_shuffled[2*valid_size:],
)

In [8]:
def get_qa_string(comment):
    '''Render one comment record as a context header plus a Q/A pair.

    The context names the subreddit, title and self-text of the submission.
    The question is the parent comment's body when a parent comment exists,
    otherwise the fixed prompt 'What do you think?'. The answer is the
    comment's own body.

    comment: mapping with keys 'submission' (holding 'subreddit', 'title',
        'selftext'), 'parent_comment' (None or a mapping holding 'body')
        and 'comment' (holding 'body').
    Returns the formatted string; a missing key raises KeyError.
    '''
    submission = comment['submission']
    context_fields = {
        'subname': submission['subreddit'],
        'title': submission['title'],
        'body': submission['selftext'],
    }
    context = 'In subreddit: {subname}\nTitle: {title}\n{body}'.format(**context_fields)

    # Top-level comments have no parent, so fall back to a generic prompt.
    parent = comment['parent_comment']
    question = 'What do you think?' if parent is None else parent['body']

    answer = comment['comment']['body']
    return '{context}\n\nQ: {q}\nA: {a}'.format(context=context, q=question, a=answer)

In [9]:
# Directory that receives the generated test.txt / valid.txt / train.txt files.
outputpath = 'finetune/data/suncoasthost'
# Create the directory (and any missing parents); exist_ok=True makes re-running
# this cell harmless when it already exists.
os.makedirs(outputpath, exist_ok=True)

In [10]:
def write_to_text(fnames, outputfname, verbose=1):
    '''Concatenate formatted comments into one text file.

    Every file in fnames is parsed as JSON, rendered with get_qa_string and
    written to outputfname, followed by a line holding the module-level
    eos token. Existing content of outputfname is overwritten.

    fnames: sequence of paths to JSON comment files.
    outputfname: path of the text file to (re)create.
    verbose: when greater than 0, print a '[i/total]' progress line before
        every 100th file, starting with the first.
    '''
    total = len(fnames)
    # Open the destination once: 'w' truncates it, and a single handle avoids
    # reopening the file in append mode for every comment. The encoding is
    # explicit so non-ASCII comment text does not depend on the platform's
    # locale default.
    with open(outputfname, 'w', encoding='utf-8') as out:
        for i, fname in enumerate(fnames):
            if i % 100 == 0 and verbose > 0:
                print('[{}/{}]'.format(i, total))
            # Input files are read as UTF-8 (the standard JSON encoding).
            with open(fname, encoding='utf-8') as f:
                comment = json.load(f)
            out.write('{body}\n{eos}\n'.format(
                body=get_qa_string(comment),
                eos=eos
            ))

In [11]:
# Write each split to its own file under outputpath, announcing the file name
# on a separate line before its progress output.
for split_file, split_fnames in (
    ('test.txt', fnames_test),
    ('valid.txt', fnames_valid),
    ('train.txt', fnames_train),
):
    print('\n' + split_file)
    write_to_text(split_fnames, os.path.join(outputpath, split_file))


test.txt
[0/15]

valid.txt
[0/15]

train.txt
[0/129]
[100/129]
