In [1]:
import os
import json
from glob import glob

In [2]:
from transformers import GPT2Tokenizer

In [3]:
# Load the pretrained "gpt2" tokenizer and keep its end-of-sequence token string.
# `eos` is written after every sample by write_to_text() as a separator line.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
eos = tokenizer.eos_token

In [4]:
# Glob pattern selecting the JSON files (one comment record per file) to convert.
inputpath = 'data/user/suncoasthost/*.json'

In [5]:
# glob() returns matches in arbitrary, filesystem-dependent order; sort them so
# the list of input files is identical from run to run and machine to machine.
fnames = sorted(glob(inputpath))

In [6]:
import numpy as np

In [7]:
# Fraction of the files reserved for validation (the test split uses the same size).
valid_prop = .1
# Fixed seed for a dedicated generator, so the shuffle is repeatable for a given
# file order and does not depend on (or disturb) the global NumPy RNG state.
split_seed = 0
# Random permutation of 0..len(fnames)-1, used to shuffle the file list.
shuffled_indices = list(np.random.RandomState(split_seed).permutation(len(fnames)))
# At least one file per held-out split, even when valid_prop*len(fnames) < 1.
valid_size = max(1, int(valid_prop*len(fnames)))

In [8]:
# Apply the permutation: fnames_shuffled[k] is fnames[shuffled_indices[k]].
fnames_shuffled = [fnames[i] for i in shuffled_indices]

In [9]:
# Partition the shuffled list into three disjoint slices:
#   first valid_size files  -> test
#   next valid_size files   -> valid
#   everything that remains -> train
# NOTE: with 2*valid_size or fewer files the train slice is empty.
fnames_test = fnames_shuffled[:valid_size]
fnames_valid = fnames_shuffled[valid_size:2*valid_size]
fnames_train = fnames_shuffled[2*valid_size:]

In [10]:
def get_qa_string(comment):
    """Format one comment record as a context-plus-Q/A text sample.

    The sample opens with the submission's subreddit, title and selftext,
    followed by a blank line and a ``Q:`` / ``A:`` pair. The question is the
    parent comment's body, or the fixed prompt 'What do you think?' when
    ``comment['parent_comment']`` is None; the answer is the comment's body.

    Args:
        comment: mapping with the keys 'submission' (providing 'subreddit',
            'title' and 'selftext'), 'parent_comment' (None or a mapping
            providing 'body') and 'comment' (providing 'body').

    Returns:
        The formatted sample as a single string.

    Raises:
        KeyError: if any of the keys listed above is missing.
    """
    submission = comment['submission']
    subname = submission['subreddit']
    title = submission['title']
    selftext = submission['selftext']
    context = f'In subreddit: {subname}\nTitle: {title}\n{selftext}'

    # Top-level comments have no parent, so a generic prompt stands in for the question.
    parent = comment['parent_comment']
    if parent is None:
        question = 'What do you think?'
    else:
        question = parent['body']
    answer = comment['comment']['body']

    return f'{context}\n\nQ: {question}\nA: {answer}'

In [11]:
# Directory that receives the generated text files; created if it does not exist yet.
outputpath = 'data/finetune/suncoasthost'
os.makedirs(outputpath, exist_ok=True)

In [12]:
def write_to_text(fnames, outputfname):
    """Convert a list of comment JSON files into one text file of samples.

    Every file in ``fnames`` is parsed as JSON, rendered with
    ``get_qa_string`` and written to ``outputfname`` followed by a newline,
    the module-level ``eos`` token and another newline. Existing content of
    ``outputfname`` is discarded; an empty ``fnames`` produces an empty file.
    A ``[i/total]`` progress line (``i`` starting at 0) is printed per file.

    Args:
        fnames: sized sequence of paths to JSON files, one comment record each.
        outputfname: path of the text file to (over)write.

    Raises:
        OSError: if an input file or the destination cannot be opened.
        json.JSONDecodeError: if an input file is not valid JSON.
    """
    total = len(fnames)
    # Open the destination once in write mode: this truncates previous content
    # and avoids reopening the file for every single sample. UTF-8 is requested
    # explicitly so non-ASCII text does not depend on the platform default.
    with open(outputfname, 'w', encoding='utf-8') as out:
        for i, fname in enumerate(fnames):
            print ('[{}/{}]'.format(i, total))
            with open(fname, encoding='utf-8') as f:
                comment = json.load(f)
            out.write('{body}\n{eos}\n'.format(
                body=get_qa_string(comment),
                eos=eos
            ))

In [13]:
# Write the three splits in a fixed order (test, valid, train), announcing each
# output file name before its per-file progress lines.
for _split_file, _split_fnames in (
    ('test.txt', fnames_test),
    ('valid.txt', fnames_valid),
    ('train.txt', fnames_train),
):
    print ('\n' + _split_file)
    write_to_text(_split_fnames, os.path.join(outputpath, _split_file))


test.txt
[0/15]
[1/15]
[2/15]
[3/15]
[4/15]
[5/15]
[6/15]
[7/15]
[8/15]
[9/15]
[10/15]
[11/15]
[12/15]
[13/15]
[14/15]

valid.txt
[0/15]
[1/15]
[2/15]
[3/15]
[4/15]
[5/15]
[6/15]
[7/15]
[8/15]
[9/15]
[10/15]
[11/15]
[12/15]
[13/15]
[14/15]

train.txt
[0/129]
[1/129]
[2/129]
[3/129]
[4/129]
[5/129]
[6/129]
[7/129]
[8/129]
[9/129]
[10/129]
[11/129]
[12/129]
[13/129]
[14/129]
[15/129]
[16/129]
[17/129]
[18/129]
[19/129]
[20/129]
[21/129]
[22/129]
[23/129]
[24/129]
[25/129]
[26/129]
[27/129]
[28/129]
[29/129]
[30/129]
[31/129]
[32/129]
[33/129]
[34/129]
[35/129]
[36/129]
[37/129]
[38/129]
[39/129]
[40/129]
[41/129]
[42/129]
[43/129]
[44/129]
[45/129]
[46/129]
[47/129]
[48/129]
[49/129]
[50/129]
[51/129]
[52/129]
[53/129]
[54/129]
[55/129]
[56/129]
[57/129]
[58/129]
[59/129]
[60/129]
[61/129]
[62/129]
[63/129]
[64/129]
[65/129]
[66/129]
[67/129]
[68/129]
[69/129]
[70/129]
[71/129]
[72/129]
[73/129]
[74/129]
[75/129]
[76/129]
[77/129]
[78/129]
[79/129]
[80/129]
[81/129]
[82/129]
[83/129]
[8