https://huggingface.co/transformers/quickstart.html

https://github.com/huggingface/transformers/tree/master/examples/language-modeling

https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py

https://github.com/huggingface/transformers/blob/master/src/transformers/training_args.py

In [None]:
import os
import json
from glob import glob
import numpy as np
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    TextDataset, 
    PreTrainedTokenizer, 
    DataCollatorForLanguageModeling,
    TrainingArguments, 
    Trainer
)

In [None]:
# Name of the pretrained checkpoint shared by the tokenizer and the model.
gpt2pretrainedstr = "gpt2"

# Load the tokenizer first, then the LM-head model, from the same checkpoint.
tokenizer, model = (
    GPT2Tokenizer.from_pretrained(gpt2pretrainedstr),
    GPT2LMHeadModel.from_pretrained(gpt2pretrainedstr),
)

# End-of-sequence marker; written after every example in the text files.
eos = tokenizer.eos_token

# Model Data

In [None]:
# Glob pattern for the input JSON files and the fraction of files that each
# held-out split (test and valid) should receive.
inputpath = 'data/user/suncoasthost/*.json'
valid_prop = .1

fnames = glob(inputpath)

# Draw every file index exactly once, in random order, then reorder the
# file names accordingly.
shuffled_indices = list(
    np.random.choice(range(len(fnames)), size=len(fnames), replace=False)
)
fnames_shuffled = [fnames[idx] for idx in shuffled_indices]

# Each held-out split gets valid_prop of the files, but never fewer than one.
# NOTE: with fewer than three input files the training split ends up empty.
valid_size = max(1, int(valid_prop * len(fnames)))

# Consecutive slices of the shuffled list: test first, then valid, then train.
fnames_test, fnames_valid, fnames_train = (
    fnames_shuffled[:valid_size],
    fnames_shuffled[valid_size:2 * valid_size],
    fnames_shuffled[2 * valid_size:],
)

In [None]:
def get_qa_string(comment):
    '''Render one comment record as a context block followed by a Q/A pair.

    The context lists the subreddit, title and self-text of the submission.
    The question is the parent comment's body when the record has a parent
    comment, otherwise a generic prompt. The answer is the comment's own body.
    '''
    submission = comment['submission']
    header = (
        f"In subreddit: {submission['subreddit']}\n"
        f"Title: {submission['title']}\n"
        f"{submission['selftext']}"
    )

    # Top-level comments have no parent, so fall back to a generic question.
    parent = comment['parent_comment']
    prompt = 'What do you think?' if parent is None else parent['body']

    reply = comment['comment']['body']
    return f'{header}\n\nQ: {prompt}\nA: {reply}'

def write_to_text(fnames, outputfname, verbose=1):
    '''Concatenate JSON comment files into a single text file.

    Each input file is parsed as JSON, formatted with get_qa_string and
    written to outputfname followed by a line holding the module-level
    `eos` token. Any existing content of outputfname is replaced.

    fnames      -- sequence of paths to JSON comment files
    outputfname -- path of the text file to (re)create
    verbose     -- when > 0, print a progress line every 100 files
    '''
    total = len(fnames)
    # Open the destination once: 'w' truncates it, so no separate clearing
    # step or per-file reopen is needed. UTF-8 is set explicitly so non-ASCII
    # comment text does not depend on the platform's default encoding.
    with open(outputfname, 'w', encoding='utf-8') as out:
        for i, fname in enumerate(fnames):
            if verbose > 0 and i % 100 == 0:
                print ('[{}/{}]'.format(i, total))
            with open(fname, encoding='utf-8') as f:
                comment = json.load(f)
            out.write('{body}\n{eos}\n'.format(
                body=get_qa_string(comment),
                eos=eos
            ))

In [None]:
# Directory that receives the train/valid/test text files.
modeldatapath = 'finetune/suncoasthost/data/'
# Create it (and any missing parents); a pre-existing directory is fine.
# The original passed the undefined name `outputpath` here.
os.makedirs(modeldatapath, exist_ok=True)

In [None]:
# One text file per split, all located under modeldatapath.
file_path_train, file_path_valid, file_path_test = (
    os.path.join(modeldatapath, basename)
    for basename in ('train.txt', 'valid.txt', 'test.txt')
)

In [None]:
# Write each split to its text file, in the order train, valid, test.
for _split_fnames, _split_path in (
    (fnames_train, file_path_train),
    (fnames_valid, file_path_valid),
    (fnames_test, file_path_test),
):
    write_to_text(_split_fnames, _split_path)

# Model Training

In [None]:
def get_dataset(file_path, tokenizer: PreTrainedTokenizer):
    '''Build a TextDataset over file_path using the tokenizer's maximum length.

    file_path -- path of the text file to tokenize
    tokenizer -- tokenizer used to encode the text; also supplies block_size

    Returns the TextDataset. Any cached features are rebuilt because
    overwrite_cache is set.
    '''
    # `max_len` is a deprecated alias of `model_max_length` in transformers.
    # Prefer the current attribute and fall back to the old one so the
    # function works on either side of the rename.
    block_size = getattr(tokenizer, 'model_max_length', None)
    if block_size is None:
        block_size = tokenizer.max_len
    return TextDataset(
        tokenizer=tokenizer,
        file_path=file_path,
        block_size=block_size,
        overwrite_cache=True,
    )

In [None]:
# One dataset per split. The original assigned the validation data to
# `train_dataset` (overwriting the training set) and referenced the undefined
# name `file_path_eval` for the test split.
train_dataset = get_dataset(file_path_train, tokenizer=tokenizer)
valid_dataset = get_dataset(file_path_valid, tokenizer=tokenizer)
test_dataset = get_dataset(file_path_test, tokenizer=tokenizer)

In [None]:
# Batch collator for language modeling; mlm=False turns off the masked-LM
# objective (GPT-2 is trained as a causal LM).
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Directory handed to the Trainer as its output location.
modeloutputpath = 'finetune/suncoasthost/model/'

# Enable both the training and the evaluation phase; every other setting
# keeps the library default.
training_args = TrainingArguments(do_train=True, do_eval=True, output_dir=modeloutputpath)

In [None]:
# Wire model, data and settings into a Trainer.
# NOTE(review): `prediction_loss_only` as a Trainer argument depends on the
# installed transformers version -- confirm it is still accepted here.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=data_collator,
    prediction_loss_only=True,
)

In [None]:
# Fine-tune the model on the train_dataset given to the Trainer.
trainer.train()
# Persist the fine-tuned model. No path is passed, so this presumably
# writes under training_args.output_dir -- confirm against the library docs.
trainer.save_model()

In [None]:
# Evaluate on the eval_dataset passed to the Trainer (test_dataset) and keep
# whatever evaluate() returns for later inspection.
eval_output = trainer.evaluate()