In [3]:
import os, time
import pickle
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, PreTrainedTokenizer, GPT2LMHeadModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [4]:
# The code for tokenizing the text and storing it in files is imported here
def tokenize_longform_text(raw_text_paths: list,
                           tokenizer: PreTrainedTokenizer,
                           block_size: int,
                           drop_last=True,
                           overlap=True):
    """ Loads raw LONGFORM text from a list of paths to text files, tokenizes it, splits the tokenized
     text into training examples and returns the list. Requires passing in a HuggingFace Transformers
     pretrained tokenizer"""

    # TODO: Look into methods of text augmentation, put this in as a placeholder

    # find correct block size of the tokenizer
    block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

    # check that all the text file paths actually files
    for text_file in raw_text_paths:
        assert os.path.isfile(text_file), "{} is not a file".format(text_file)

    # make empty list to store all the examples
    examples = []

    # loop over all text files
    for text_file in raw_text_paths:

        with open(text_file, encoding="utf-8") as f:
            try:
                text = f.read()
                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
                print("{} successfully read and tokenized".format(text_file))
            except:
                print("Error reading or tokenizing file {}".format(text_file))

        # check that the tokenized file is at least one block size long
        len_tokens = len(tokenized_text)
        print(len_tokens)
        if len_tokens < block_size:
            print("File {} is too short for the block size".format(text_file))
            pass

        try:
            if overlap is False:
                for i in range(0, len_tokens - block_size + 1, block_size):  # don't overlap examples
                    examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))
            else:  # overlap examples
                for i in range(0, len_tokens - block_size + 1, int(block_size / 2)):
                    examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))

            if drop_last is False:
                examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[-block_size:]))

            print("Successfully split tokens from file {} into examples".format(text_file))
        except:
            print("Failed at splitting tokens from file {} into examples".format(text_file))

    print("{} examples total tokenized".format(len(examples)))

    return examples

In [28]:
def make_tokenized_examples(tokenizer: PreTrainedTokenizer,
                            block_size: int,
                            root_dir_path: str,
                            examples_file=None):
    """Tokenize a directory of text where the raw text is in a subdirectory '/raw_text' and the default
    is to put the tokenized text into a sub directory '/tokenized_examples' """

    # -------------------------------------------------------------------------------------------------------------
    # Do a bunch of error checking, finding text files and making output file names
    # -------------------------------------------------------------------------------------------------------------

    # check that root_dir_path is actually a path
    assert os.path.isdir(root_dir_path), "{} is not a directory".format(root_dir_path)
    # and check that we didn't accidentally put a / at the end of the dir path
    if not os.path.split(root_dir_path)[1]:
        root_dir_path = os.path.split(root_dir_path)[0]

    # check that there is a raw_text directory
    raw_dir_path = os.path.join(root_dir_path, "raw_text")
    assert os.path.isdir(raw_dir_path), "{} has no raw_text/ subdirectory".format(root_dir_path)

    # check that there are text files in there and if so get their names
    file_list = []
    for file in os.listdir(raw_dir_path):
        if file.endswith(".txt"):
            file_list.append(os.path.join(raw_dir_path, file))
    if len(file_list) == 0:
        raise RuntimeError("No text files found in {}".format(raw_dir_path))

    # now get the tokenized text file name, make the tokenized_text directory if necessary

    if examples_file is None:
        tokenized_dir = os.path.join(root_dir_path, "tokenized_examples")
        if not os.path.isdir(tokenized_dir):
            os.mkdir(tokenized_dir)

        print(root_dir_path)
        print(os.path.split(root_dir_path))

        author_name = os.path.split(root_dir_path)[1]
        examples_file = "examples_gpt2_blocksize_{}_{}.pkl".format(block_size, author_name)
        examples_file = os.path.join(tokenized_dir, examples_file)

    else:
        assert type(examples_file) is str, "tokenized_file_name must be a string or None"
        tokenized_dir = os.path.split(examples_file)[0]
        assert os.path.isdir(tokenized_dir), "{} is not a directory".format(tokenized_dir)

    # -------------------------------------------------------------------------------------------------------------
    # After all that make the examples and save them
    # -------------------------------------------------------------------------------------------------------------

    # tokenize all the files, split them into examples and concatenate them
    examples = tokenize_longform_text(file_list, tokenizer, block_size, drop_last=False, overlap=True)

    # save them as a pickle
    with open(examples_file, 'wb') as f:
        pickle.dump(examples, f, protocol=pickle.HIGHEST_PROTOCOL)

    print("{} examples created and saved in {}".format(len(examples), examples_file))

In [8]:
import pandas as pd 
train_stories = pd.read_csv("story_generation_dataset/ROCStories_train.csv", encoding="utf8")
test_stories = pd.read_csv("story_generation_dataset/ROCStories_test.csv", encoding="utf8")

In [31]:
index = 1
for data in train_stories.values[:2000,1:]:
    with open("data/train/"+str(index)+".txt","w") as file:
        for sent in data:
            file.write(sent+ r'\r\n\ '[:-1])
    file.close()
    index += 1

index = 1
for data in train_stories.values[:500,1:]:
    with open("data/test/"+str(index)+".txt","w") as file:
        for sent in data:
            file.write(sent+ r'\r\n\ '[:-1])
    file.close()
    index += 1


In [7]:
# import the tokenizer from the Transformers library
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading: 100%|██████████| 1.04M/1.04M [00:01<00:00, 779kB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 456k/456k [00:01<00:00, 365kB/s]  
Downloading: 100%|██████████| 665/665 [00:00<00:00, 84.1kB/s]


In [19]:
train_path = 'data/train/'
test_path = 'data/test/'

In [32]:
make_tokenized_examples(gpt2_tokenizer,10, train_path, examples_file=None);

data/train
('data', 'train')
data/train\raw_text\1.txt successfully read and tokenized
64
Failed at splitting tokens from file data/train\raw_text\1.txt into examples
data/train\raw_text\10.txt successfully read and tokenized
45
Failed at splitting tokens from file data/train\raw_text\10.txt into examples
data/train\raw_text\100.txt successfully read and tokenized
50
Failed at splitting tokens from file data/train\raw_text\100.txt into examples
data/train\raw_text\1000.txt successfully read and tokenized
49
Failed at splitting tokens from file data/train\raw_text\1000.txt into examples
data/train\raw_text\1001.txt successfully read and tokenized
32
Failed at splitting tokens from file data/train\raw_text\1001.txt into examples
data/train\raw_text\1002.txt successfully read and tokenized
65
Failed at splitting tokens from file data/train\raw_text\1002.txt into examples
data/train\raw_text\1003.txt successfully read and tokenized
47
Failed at splitting tokens from file data/train\raw_tex

In [33]:
class StoryData(torch.utils.data.Dataset):
    '''This is a class for loading in a list of tokenized gpt2 examples from a list of file paths'''

    def __init__(
            self,
            file_paths: list):

        for fpath in file_paths:
            assert os.path.isfile(fpath), "{} does not exist".format(fpath)

        self.examples = []

        for fpath in file_paths:
            with open(fpath, 'rb') as f:
                examps = pickle.load(f)
            self.examples.extend(examps)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [34]:
file_paths = []
file_paths.append(os.path.join('data/train/', "tokenized_examples/examples_gpt2_blocksize_10_train.pkl"))
print(file_paths)

['data/train/tokenized_examples/examples_gpt2_blocksize_256_train.pkl']


In [35]:
# make the actual instance of the Dataset class using the chosen authors 
story_dataset = StoryData(file_paths)

# make the data loader
#NOTE: the batch_size is 1 because we will be doing gradient accumulation. This is to get around the fact
# that I am using a 8GB RTX 2070 Super GPU which is small
story_dataloader = DataLoader(story_dataset, batch_size =1, shuffle = True)

In [38]:
# Pick a model to train

# distilgpt2 will on my 8GB GPU

model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# gpt2-medium will not train on an 8GB GPU ... but you can generate text with the pre-trained model if you like.
# if you have a larger GPU, say a 24 GB RTX 3090 you may wish to try training though
# model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# NOTE: look into gradient checkpointing and see if that will allow for training within 8GB

Downloading: 100%|██████████| 762/762 [00:00<00:00, 191kB/s]
Downloading: 100%|██████████| 353M/353M [00:42<00:00, 8.21MB/s] 


In [None]:
# set epochs and batch size
N_EPOCHS = 5
BATCH_SIZE = 8 # note we are actually going to use gradient accumulation because these models are so big

# for the scheduler
LEARNING_RATE = 0.0001 #0.00002
WARMUP_STEPS = 100 # 10000

In [None]:
# put the model on the gpu. note if this doesn't say you're using the gpu this will not train!

device = 'cpu'
if torch.cuda.is_available():
    print('using gpu')
    device = 'cuda'
print('device:',device)

model.to(device)

# create optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# make scheduler (for varying learning rate over time)

Before finetune

In [None]:
# make a function to generate text from the model

def generate_test_text(model, max_length=256, input_text=None):
    model.eval() # put the model in eval mode
    if input_text is None:
        input_text = "Once upon a time there was a little mouse."
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt')
    input_ids = input_ids.to('cuda')
    output_ids = model.generate(input_ids, 
                                pad_token_id=gpt2_tokenizer.eos_token_id,
                                max_length=max_length, 
                                do_sample=True, 
                                top_p=0.95, 
                                top_k=60,
                                num_return_sequences=1)

    output_text = gpt2_tokenizer.decode(output_ids[0])
    return output_text

In [None]:
# and do some generation. The output should be different every time you run this cell 
# and of all sorts of topics and styles
prompt ="once upon a time there was a little mouse" 
print(generate_test_text(model,input_text=prompt, max_length=256))

Finetune

In [None]:
# This is the link to the gradient accumulation documentation
# https://pytorch.org/docs/stable/notes/amp_examples.html#gradient-accumulation# 

#TODO: look into gradient checkpointing as well

epoch_loss = 0.0 # used to track loss for each epoch

internal_batch_count = 0 # used to track number of examples within each batch. This is necessary 
                         # because of the gradient accumulation hack (to deal with my 8GB GPU memory)
    
# make FP16 scaler for faster training
scaler = torch.cuda.amp.GradScaler()
    
# put the model into training mode
model.train()

for epoch in range(N_EPOCHS): # iterate over epochs
    
    print("started epoch {}".format(epoch))
    
    for idx, text in enumerate(story_dataloader):  # this data loader is set up to shuffle automatically

        # Do the forward propagation. 
        # Don't forget to put the text onto the gpu.
        # you have to put in labels to get the loss as an output.
        # because GPT is an autogregessive model the input is the output for training purposes
        
        with torch.cuda.amp.autocast():
            outputs = model(text.to(device), labels=(text.to(device)))
        
            # get the loss out so we can do backwards propagation
            loss, logits = outputs[:2]
            loss = loss / BATCH_SIZE 
        
        # do backpropagation. yay autodifferentiation!
        # note the use of the scaler for the FP16 
        scaler.scale(loss).backward()
        
        # keep track of the loss
        epoch_loss = epoch_loss + loss.detach().cpu().numpy()  # need to detach the gradients 
                                                               # because we only care about the numerical value
                                                               # also store the epoch loss on the cpu as numpy
            
        # increment the internal_batch_count
        internal_batch_count = internal_batch_count + 1
        
        # Now, if we have run through a full batch, take some optimizer and gradient steps
        if internal_batch_count == BATCH_SIZE:
            internal_batch_count = 0 # reset this
            
            # take an optimizer step. note the use of the scaler for FP16
            scaler.step(optimizer) 
            scaler.update()
            optimizer.zero_grad() # zero out the gradients in the optimizer
            
            model.zero_grad() # zero out the gradients we've been accumulating in the model
            
            scheduler.step() # take a scheduler step

     # Now that we've gone through an epoch, let's see what the loss is and what some generated text looks like
    
    # put the model into evaluation mode
    model.eval()
    
    # print the loss
    print("Epoch {} has loss {}".format(epoch, epoch_loss))
    # reset the loss
    epoch_loss = 0.0
    
    # uncomment this if you want to print some test text after each epoch
    #print(generate_test_text(model,input_text=prompt))
    
    
    # put the model back in training mode
    model.train()

Generate Story after finetune

In [None]:
# Pick a prompt
prompt ="Once upon a time there was a little mouse" 

# And generate text!
print(generate_test_text(model,input_text=prompt, max_length=256))