https://huggingface.co/transformers/quickstart.html

https://github.com/huggingface/transformers/tree/master/examples/language-modeling

https://github.com/huggingface/transformers/blob/master/examples/language-modeling/run_language_modeling.py

https://github.com/huggingface/transformers/blob/master/src/transformers/training_args.py

In [1]:
import os
import json
from glob import glob
import numpy as np
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    TextDataset, 
    PreTrainedTokenizer, 
    DataCollatorForLanguageModeling,
    TrainingArguments, 
    Trainer
)

In [2]:
gpt2pretrainedstr = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(gpt2pretrainedstr)
model = GPT2LMHeadModel.from_pretrained(gpt2pretrainedstr)
eos = tokenizer.eos_token

# Model Data

In [3]:
inputpath = 'data/user/suncoasthost/*.json'

fnames = glob(inputpath)
valid_prop = .1
shuffled_indices = list(np.random.choice(range(len(fnames)), len(fnames), replace=False))
valid_size = max(1, int(valid_prop*len(fnames)))

fnames_shuffled = [fnames[i] for i in shuffled_indices]
fnames_test = fnames_shuffled[:valid_size]
fnames_valid = fnames_shuffled[valid_size:2*valid_size]
fnames_train = fnames_shuffled[2*valid_size:]

In [4]:
def get_qa_string(comment):
    '''format comment as question and answer'''
    context = 'In subreddit: {subname}\nTitle: {title}\n{body}'.format(
        subname = comment['submission']['subreddit'],
        title = comment['submission']['title'],
        body = comment['submission']['selftext'],
    )
    question = 'What do you think?'
    if comment['parent_comment'] is not None:
        question = comment['parent_comment']['body']
    return '{context}\n\nQ: {q}\nA: {a}'.format(
        context = context,
        q = question,
        a = comment['comment']['body'],
    )

def write_to_text(fnames, outputfname, verbose=1):
    # clear destination
    with open(outputfname, 'w+') as f:
        f.write('')
    
    total = len(fnames)
    i = 0
    for fname in fnames:
        if i % 100 == 0 and verbose > 0:
            print ('[{}/{}]'.format(i, total))
        i += 1
        with open(fname) as f:
            comment = json.load(f)
        with open(outputfname, 'a+') as f:
            f.write('{body}\n{eos}\n'.format(
                body=get_qa_string(comment),
                eos=eos
            ))

In [5]:
modeldatapath = 'finetune/suncoasthost/data/'
os.makedirs(modeldatapath, exist_ok=True)

In [6]:
file_path_train = os.path.join(modeldatapath, 'train.txt')
file_path_valid = os.path.join(modeldatapath, 'valid.txt')
file_path_test = os.path.join(modeldatapath, 'test.txt')

In [7]:
write_to_text(fnames_train, file_path_train)
write_to_text(fnames_valid, file_path_valid)
write_to_text(fnames_test, file_path_test)

[0/129]
[100/129]
[0/15]
[0/15]


# Model Training

In [8]:
tokenizer.max_len

1024

In [9]:
def get_dataset(file_path, tokenizer: PreTrainedTokenizer):
    return TextDataset(
        tokenizer=tokenizer, 
        file_path=file_path, 
        #block_size=tokenizer.max_len, 
        block_size=128,
        overwrite_cache=True,
    )

In [10]:
train_dataset = get_dataset(file_path_train, tokenizer=tokenizer)
valid_dataset = get_dataset(file_path_valid, tokenizer=tokenizer)
test_dataset = get_dataset(file_path_test, tokenizer=tokenizer)

In [11]:
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
modeloutputpath = 'finetune/suncoasthost/model/'

training_args = TrainingArguments(
    output_dir=modeloutputpath,
    do_train=True,
    do_eval=True,
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    prediction_loss_only=True,
)

In [14]:
trainer.train()
trainer.save_model()

HBox(children=(FloatProgress(value=0.0, description='Epoch', max=3.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Iteration', max=44.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=44.0, style=ProgressStyle(description_wid…




HBox(children=(FloatProgress(value=0.0, description='Iteration', max=44.0, style=ProgressStyle(description_wid…





In [15]:
trainer.evaluate()

HBox(children=(FloatProgress(value=0.0, description='Evaluation', max=5.0, style=ProgressStyle(description_wid…


{"eval_loss": 2.838185119628906, "epoch": 3.0, "step": 132}


# Model Prediction

In [17]:
from transformers import pipeline

In [18]:
predictor = pipeline('text-generation', model=modeloutputpath, tokenizer=tokenizer)

In [20]:
predictor('In subreddit: node\nTitle: Do you like Trump?\n{body}\n\nQ: What do you think?\nA: ')[0]['generated_text']

Setting `pad_token_id` to 50256 (first `eos_token_id`) to generate sequence


'In subreddit: node\nTitle: Do you like Trump?\n{body}\n\nQ: What do you think?\nA:  Have you been listening on Google or any other similar search engine and just want to help the audience learn how to'