<a href="https://colab.research.google.com/github/unicorn-yh/Story-Generation-NLP/blob/main/gpt2-story.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
data_path = '/content/gdrive/MyDrive/story_generation_dataset/'

Mounted at /content/gdrive


In [2]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[K     |████████████████████████████████| 5.8 MB 13.1 MB/s 
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[K     |████████████████████████████████| 7.6 MB 53.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.10.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[K     |████████████████████████████████| 182 kB 72.1 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.11.1 tokenizers-0.13.2 transformers-4.25.1


In [3]:
import os, time
import pickle
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, PreTrainedTokenizer, GPT2LMHeadModel
from transformers.optimization import AdamW, get_linear_schedule_with_warmup

In [4]:
# The code for tokenizing the text and storing it in files is imported here
def tokenize_longform_text(raw_text_paths: list,
                           tokenizer: PreTrainedTokenizer,
                           block_size: int,
                           drop_last=True,
                           overlap=True):
    """ Loads raw LONGFORM text from a list of paths to text files, tokenizes it, splits the tokenized
     text into training examples and returns the list. Requires passing in a HuggingFace Transformers
     pretrained tokenizer"""

    # TODO: Look into methods of text augmentation, put this in as a placeholder

    # find correct block size of the tokenizer
    block_size = block_size - (tokenizer.model_max_length - tokenizer.max_len_single_sentence)

    # check that all the text file paths actually files
    for text_file in raw_text_paths:
        assert os.path.isfile(text_file), "{} is not a file".format(text_file)

    # make empty list to store all the examples
    examples = []

    # loop over all text files
    for text_file in raw_text_paths:

        with open(text_file, encoding="utf-8") as f:
            try:
                text = f.read()
                tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
                print("{} successfully read and tokenized".format(text_file))
            except:
                print("Error reading or tokenizing file {}".format(text_file))

        # check that the tokenized file is at least one block size long
        len_tokens = len(tokenized_text)
        print(len_tokens)
        if len_tokens < block_size:
            print("File {} is too short for the block size".format(text_file))
            pass

        try:
            if overlap is False:
                for i in range(0, len_tokens - block_size + 1, block_size):  # don't overlap examples
                    examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))
            else:  # overlap examples
                for i in range(0, len_tokens - block_size + 1, int(block_size / 2)):
                    examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))

            if drop_last is False:
                examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[-block_size:]))

            print("Successfully split tokens from file {} into examples".format(text_file))
        except:
            print("Failed at splitting tokens from file {} into examples".format(text_file))

    print("{} examples total tokenized".format(len(examples)))

    return examples

In [5]:
def make_tokenized_examples(tokenizer: PreTrainedTokenizer,
                            block_size: int,
                            root_dir_path: str,
                            examples_file=None):
    """Tokenize a directory of text where the raw text is in a subdirectory '/raw_text' and the default
    is to put the tokenized text into a sub directory '/tokenized_examples' """

    # -------------------------------------------------------------------------------------------------------------
    # Do a bunch of error checking, finding text files and making output file names
    # -------------------------------------------------------------------------------------------------------------

    # check that root_dir_path is actually a path
    assert os.path.isdir(root_dir_path), "{} is not a directory".format(root_dir_path)
    # and check that we didn't accidentally put a / at the end of the dir path
    if not os.path.split(root_dir_path)[1]:
        root_dir_path = os.path.split(root_dir_path)[0]

    # check that there is a raw_text directory
    raw_dir_path = os.path.join(root_dir_path, "raw_text")
    assert os.path.isdir(raw_dir_path), "{} has no raw_text/ subdirectory".format(root_dir_path)

    # check that there are text files in there and if so get their names
    file_list = []
    for file in os.listdir(raw_dir_path):
        if file.endswith(".txt"):
            file_list.append(os.path.join(raw_dir_path, file))
    if len(file_list) == 0:
        raise RuntimeError("No text files found in {}".format(raw_dir_path))

    # now get the tokenized text file name, make the tokenized_text directory if necessary

    if examples_file is None:
        tokenized_dir = os.path.join(root_dir_path, "tokenized_examples")
        if not os.path.isdir(tokenized_dir):
            os.mkdir(tokenized_dir)

        print(root_dir_path)
        print(os.path.split(root_dir_path))

        author_name = os.path.split(root_dir_path)[1]
        examples_file = "examples_gpt2_blocksize_{}_{}.pkl".format(block_size, author_name)
        examples_file = os.path.join(tokenized_dir, examples_file)

    else:
        assert type(examples_file) is str, "tokenized_file_name must be a string or None"
        tokenized_dir = os.path.split(examples_file)[0]
        assert os.path.isdir(tokenized_dir), "{} is not a directory".format(tokenized_dir)

    # -------------------------------------------------------------------------------------------------------------
    # After all that make the examples and save them
    # -------------------------------------------------------------------------------------------------------------

    # tokenize all the files, split them into examples and concatenate them
    examples = tokenize_longform_text(file_list, tokenizer, block_size, drop_last=False, overlap=True)

    # save them as a pickle
    with open(examples_file, 'wb') as f:
        pickle.dump(examples, f, protocol=pickle.HIGHEST_PROTOCOL)

    print("{} examples created and saved in {}".format(len(examples), examples_file))

In [32]:
import pandas as pd 
train_stories = pd.read_csv(data_path+"ROCStories_train.csv", encoding="utf8")
test_stories = pd.read_csv(data_path+"ROCStories_test.csv", encoding="utf8")

In [7]:
index = 1
for data in train_stories.values[:2000,1:]:
    with open(data_path+"data/train/raw_text/"+str(index)+".txt","w") as file:
        for sent in data:
            file.write(sent+ r'\r\n\ '[:-1])
    file.close()
    index += 1

index = 1
for data in train_stories.values[:500,1:]:
    with open(data_path+"data/test/raw_text/"+str(index)+".txt","w") as file:
        for sent in data:
            file.write(sent+ r'\r\n\ '[:-1])
    file.close()
    index += 1


In [6]:
# import the tokenizer from the Transformers library
gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
train_path = data_path+'data/train/'
test_path = data_path+'data/test/'
make_tokenized_examples(gpt2_tokenizer,10, train_path, examples_file=None);

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
/content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1335.txt successfully read and tokenized
87
Successfully split tokens from file /content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1335.txt into examples
/content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1336.txt successfully read and tokenized
71
Successfully split tokens from file /content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1336.txt into examples
/content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1337.txt successfully read and tokenized
74
Successfully split tokens from file /content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1337.txt into examples
/content/gdrive/MyDrive/story_generation_dataset/data/train/raw_text/1338.txt successfully read and tokenized
84
Successfully split tokens from file /content/gdrive/MyDrive/story_generation_dataset/data/train/raw

In [8]:
class StoryData(torch.utils.data.Dataset):
    '''This is a class for loading in a list of tokenized gpt2 examples from a list of file paths'''

    def __init__(
            self,
            file_paths: list):

        for fpath in file_paths:
            assert os.path.isfile(fpath), "{} does not exist".format(fpath)

        self.examples = []

        for fpath in file_paths:
            with open(fpath, 'rb') as f:
                examps = pickle.load(f)
            self.examples.extend(examps)

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, item):
        return torch.tensor(self.examples[item], dtype=torch.long)

In [9]:
file_paths = []
file_paths.append(os.path.join(data_path+'data/train/', "tokenized_examples/examples_gpt2_blocksize_10_train.pkl"))
print(file_paths)

['/content/gdrive/MyDrive/story_generation_dataset/data/train/tokenized_examples/examples_gpt2_blocksize_10_train.pkl']


In [10]:
# make the actual instance of the Dataset class using the chosen authors 
story_dataset = StoryData(file_paths)

# make the data loader
#NOTE: the batch_size is 1 because we will be doing gradient accumulation. This is to get around the fact
# that I am using a 8GB RTX 2070 Super GPU which is small
story_dataloader = DataLoader(story_dataset, batch_size =1, shuffle = True)

In [11]:
# Pick a model to train

# distilgpt2 will on my 8GB GPU

model = GPT2LMHeadModel.from_pretrained('distilgpt2')

# gpt2-medium will not train on an 8GB GPU ... but you can generate text with the pre-trained model if you like.
# if you have a larger GPU, say a 24 GB RTX 3090 you may wish to try training though
# model = GPT2LMHeadModel.from_pretrained('gpt2-medium')

# NOTE: look into gradient checkpointing and see if that will allow for training within 8GB

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/353M [00:00<?, ?B/s]

In [12]:
# set epochs and batch size
N_EPOCHS = 5
BATCH_SIZE = 8 # note we are actually going to use gradient accumulation because these models are so big

# for the scheduler
LEARNING_RATE = 0.0001 #0.00002
WARMUP_STEPS = 100 # 10000

In [13]:
# put the model on the gpu. note if this doesn't say you're using the gpu this will not train!

device = 'cpu'
if torch.cuda.is_available():
    print('using gpu')
    device = 'cuda'
print('device:',device)

model.to(device)

# create optimizer
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

using gpu
device: cuda




In [14]:
# BEFORE
# make a function to generate text from the model

def generate_test_text(model, max_length=256, input_text=None):
    model.eval() # put the model in eval mode
    if input_text is None:
        input_text = "Once upon a time there was a little mouse."
    input_ids = gpt2_tokenizer.encode(input_text, return_tensors='pt')
    input_ids = input_ids.to('cuda')
    output_ids = model.generate(input_ids, 
                                pad_token_id=gpt2_tokenizer.eos_token_id,
                                max_length=max_length, 
                                do_sample=True, 
                                top_p=0.95, 
                                top_k=60,
                                num_return_sequences=1)

    output_text = gpt2_tokenizer.decode(output_ids[0])
    return output_text

In [15]:
# and do some generation. The output should be different every time you run this cell 
# and of all sorts of topics and styles
prompt ="once upon a time there was a little mouse" 
print(generate_test_text(model,input_text=prompt, max_length=256))

once upon a time there was a little mouse in the back and it was so long that it took more than a minute to open it.
After you first opened the door the door opened and there was a long wooden door behind the door where the mouse was sitting. You could see the mouse sitting around as he tried to make sure he was sitting on a table to check what his body was doing.
He turned and looked at him, and he looked down at the boy who was being held against the wall by the mouse and the boy who had suddenly entered on that one.
"It is not all the same," he said and the boy slowly put it to the table. It was now the boy who had been in a position where his arms were getting stronger and so the boy sat and looked at him with his hands over his mouth.
"Did you hear the voice?"
"I am still trying to understand. It is quite strange. You say you never had a chance to be human and after a while no one can even talk to you about it. It doesn't seem to be true, even in this day and age, but I have exper

In [17]:

# make scheduler (for varying learning rate over time)
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=WARMUP_STEPS, num_training_steps=-1)

# AFTER
# This is the link to the gradient accumulation documentation
# https://pytorch.org/docs/stable/notes/amp_examples.html#gradient-accumulation# 

#TODO: look into gradient checkpointing as well

epoch_loss = 0.0 # used to track loss for each epoch

internal_batch_count = 0 # used to track number of examples within each batch. This is necessary 
                         # because of the gradient accumulation hack (to deal with my 8GB GPU memory)
    
# make FP16 scaler for faster training
scaler = torch.cuda.amp.GradScaler()
    
# put the model into training mode
model.train()

for epoch in range(N_EPOCHS): # iterate over epochs
    
    print("started epoch {}".format(epoch))
    
    for idx, text in enumerate(story_dataloader):  # this data loader is set up to shuffle automatically

        # Do the forward propagation. 
        # Don't forget to put the text onto the gpu.
        # you have to put in labels to get the loss as an output.
        # because GPT is an autogregessive model the input is the output for training purposes
        
        with torch.cuda.amp.autocast():
            outputs = model(text.to(device), labels=(text.to(device)))
        
            # get the loss out so we can do backwards propagation
            loss, logits = outputs[:2]
            loss = loss / BATCH_SIZE 
        
        # do backpropagation. yay autodifferentiation!
        # note the use of the scaler for the FP16 
        scaler.scale(loss).backward()
        
        # keep track of the loss
        epoch_loss = epoch_loss + loss.detach().cpu().numpy()  # need to detach the gradients 
                                                               # because we only care about the numerical value
                                                               # also store the epoch loss on the cpu as numpy
            
        # increment the internal_batch_count
        internal_batch_count = internal_batch_count + 1
        
        # Now, if we have run through a full batch, take some optimizer and gradient steps
        if internal_batch_count == BATCH_SIZE:
            internal_batch_count = 0 # reset this
            
            # take an optimizer step. note the use of the scaler for FP16
            scaler.step(optimizer) 
            scaler.update()
            optimizer.zero_grad() # zero out the gradients in the optimizer
            
            model.zero_grad() # zero out the gradients we've been accumulating in the model
            
            scheduler.step() # take a scheduler step

     # Now that we've gone through an epoch, let's see what the loss is and what some generated text looks like
    
    # put the model into evaluation mode
    model.eval()
    
    # print the loss
    print("Epoch {} has loss {}".format(epoch, epoch_loss))
    # reset the loss
    epoch_loss = 0.0
    
    # uncomment this if you want to print some test text after each epoch
    #print(generate_test_text(model,input_text=prompt))
    
    
    # put the model back in training mode
    model.train()

started epoch 0




Epoch 0 has loss 13455.683310374618
started epoch 1
Epoch 1 has loss 13352.844423286617
started epoch 2
Epoch 2 has loss 13353.959836542606
started epoch 3
Epoch 3 has loss 13346.500084221363
started epoch 4
Epoch 4 has loss 13358.591995038092


In [50]:
def strip_paragraph(paragraph, sentence=6):
    sent_count = 0
    output_str = ""
    for i in range(len(paragraph)):
        output_str += paragraph[i]
        if paragraph[i] == '.' and paragraph[i+1] == ' ':
            sent_count += 1
        if sent_count == 5:
            break
    return output_str



In [55]:

test_ls = np.loadtxt(data_path+"test-index.txt")
train_array = train_stories.values[:,1:].reshape(-1).tolist()
with open(data_path+"output/gpt2-generated-story.txt","w") as file:
    for i in range(20):
        output_text = ""
        index = int(test_ls[i])
        prompt = train_array[index] 
        text = str(generate_test_text(model,input_text=prompt, max_length=256)).replace('\\','').replace('rn',' ')
        text = strip_paragraph(text,sentence=6)
        output_text += "Original story: " + prompt + "\n"
        output_text += "Generated story: " + text + "\n\n"
        file.write(output_text)
        print(output_text)
file.close()



Original story: We had no reservations and had to sit at the bar.
Generated story: We had no reservations and had to sit at the bar. A friend of mine worked a lot harder with a new artist. My girlfriend wanted a shot of my new piece. The girl was an experienced artist. She got a great story from our local artists.


Original story: I dreamed my friend's daughter broke up with her boyfriend.
Generated story: I dreamed my friend's daughter broke up with her boyfriend. I went to the doctor and got the surgery. After the surgery, she was so happy. My husband took up the exam at the University of Georgia in 2008. His father gave a test.


Original story: Everyone told me to try it.
Generated story: Everyone told me to try it. I‍It was very difficult. I woke up to get nervous and tired and woke up the next mo ing. I knew that I couldn't get out of bed because of the music. I realized that my friend had gone to work.


Original story: Neither one of them could get the nerve to throw a punch.
