In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 19.6 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 58.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 53.8 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 7.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 58.8 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml

In [3]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import numpy as np
import pandas as pd 

import os
import re
import string
import random
import time
import datetime

from collections import Counter
import itertools
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
plt.style.use('bmh')

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, random_split, DataLoader, RandomSampler, SequentialSampler

pd.set_option('display.max_rows', 100)

In [5]:
poem_line_df = pd.read_csv('/content/drive/MyDrive/11785/poe-master/poe_poems_lines.csv')
poem_line_df = poem_line_df.fillna('')

In [6]:
poem_line_df

Unnamed: 0,title,line_text
0,The Raven,"Once upon a midnight dreary, while I pondered,..."
1,The Raven,Over many a quaint and curious volume of forgo...
2,The Raven,"While I nodded, nearly napping, suddenly there..."
3,The Raven,As of some one gently rapping—rapping at my ch...
4,The Raven,"""'Tis some visitor,"" I muttered, ""tapping at m..."
...,...,...
2494,The Forest Reverie,Deep in the heart whose hope has died—
2495,The Forest Reverie,"Quenching the fires its ashes hide,—"
2496,The Forest Reverie,"Its ashes, whence will spring and grow"
2497,The Forest Reverie,"Sweet flowers, ere long,—"


In [7]:
n = 301348 #number of records in file
s = 5000 #desired sample size
skip = sorted(random.sample(range(1,n),n-s))
jeff_csv=pd.read_csv('/content/drive/MyDrive/11785/data.csv',delimiter='\t',header=0,on_bad_lines='skip',skiprows=skip)
jeff_csv=jeff_csv.fillna('')

In [None]:
jeff_csv

In [8]:
RANDOM_SEED = 73
BATCH_SIZE = 5

EPOCHS = 4
SAMPLE_EVERY = 100

MAX_SEQUENCE_LENGTH = 10

In [9]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

special_tokens_dict = {'bos_token': '<BOS>', 'eos_token': '<EOS>', 'pad_token': '<PAD>'}
num_added_tokens = tokenizer.add_special_tokens(special_tokens_dict)

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
class PoePoemDataset(Dataset):
    
    def __init__(self, data, tokenizer, gpt2_type='gpt2', max_length=MAX_SEQUENCE_LENGTH):
        self.tokenizer = tokenizer
        self.input_ids = []
        self.attn_masks = []
        self.topics=[]
        
        for i in data:
            encodings_dict = tokenizer('<BOS>' + i[1] + '<EOS>',
                                     truncation=True,
                                     max_length=max_length,
                                     padding='max_length'
                                    )

            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))
            self.topics.append(i[0])


    def __len__(self):
        
        return len(self.input_ids)
    
    def __getitem__(self, idx):
        
        return self.input_ids[idx], self.attn_masks[idx]
        

In [11]:
jeff_csv_dataset=PoePoemDataset(jeff_csv.values, tokenizer, max_length=MAX_SEQUENCE_LENGTH)

In [12]:
def train_val_split(split, dataset):
    train_size = int(split * len(dataset))
    val_size = len(dataset) - train_size
    return train_size, val_size

In [13]:
jeff_train_size, jeff_val_size = train_val_split(0.8, jeff_csv_dataset)

# random split imported from troch.utils
jeff_train_dataset, jeff_val_dataset = random_split(jeff_csv_dataset, [jeff_train_size, jeff_val_size])

In [14]:
torch.cuda.manual_seed_all(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

<torch._C.Generator at 0x7fc51a610650>

In [15]:
def create_dataloaders(train_dataset, val_dataset, bs):
    train_dataloader = DataLoader(train_dataset,
                                  sampler=RandomSampler(train_dataset),
                                  batch_size=bs)

    val_dataloader = DataLoader(val_dataset,
                                sampler=SequentialSampler(val_dataset),
                                batch_size=bs)
    
    return train_dataloader, val_dataloader

In [16]:
jeff_train_dataloader, jeff_val_dataloader = create_dataloaders(jeff_train_dataset, jeff_val_dataset, BATCH_SIZE)

In [17]:
configuration = GPT2Config(vocab_size=len(tokenizer), n_positions=MAX_SEQUENCE_LENGTH).from_pretrained('gpt2', output_hidden_states=True)

In [18]:
# helper function for logging time
def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))

In [19]:
learning_rate = 5e-4
eps = 1e-8
warmup_steps = 1e2

In [20]:
device = torch.device('cuda')
# device = torch.device('cpu')
prompt = "<BOS>"
generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)
generated = generated.to(device)

In [21]:
def create_model(train_dataloader, val_dataloader, file_name):

    model = GPT2LMHeadModel.from_pretrained('gpt2', config=configuration)
    model.resize_token_embeddings(len(tokenizer))
    
    model.cuda()
    optimizer = AdamW(model.parameters(), lr=learning_rate, eps=eps)

    total_steps = len(train_dataloader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    
    total_t0 = time.time()
    model = model.to(device)

    for epoch_i in range(0, EPOCHS):

        print(f'Epoch {epoch_i + 1} of {EPOCHS}')

        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            model.zero_grad()        

            outputs = model(b_input_ids,
                            labels=b_labels, 
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]  

            batch_loss = loss.item()
            total_train_loss += batch_loss

            if step % SAMPLE_EVERY == 0 and step != 0:
                
                model.eval()
                sample_outputs = model.generate(
                                        generated,
                                        do_sample=True,   
                                        top_k=50, 
                                        max_length=200,
                                        top_p=0.95, 
                                        num_return_sequences=1
                                    )
                for i, sample_output in enumerate(sample_outputs):
                      print(f'Example output: {tokenizer.decode(sample_output, skip_special_tokens=True)}')

                model.train()

            loss.backward()
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_dataloader)       
        training_time = format_time(time.time() - t0)

        print(f'Average Training Loss: {avg_train_loss}. Epoch time: {training_time}')

        t0 = time.time()

        print('Evaluating Model')

        model.eval()

        total_eval_loss = 0
        nb_eval_steps = 0

        for batch in val_dataloader:
            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_masks = batch[1].to(device)

            with torch.no_grad():        

                outputs  = model(b_input_ids,  
                                 attention_mask=b_masks,
                                 labels=b_labels)

                loss = outputs[0]  

            batch_loss = loss.item()
            total_eval_loss += batch_loss        

        avg_val_loss = total_eval_loss / len(val_dataloader)

        validation_time = format_time(time.time() - t0)    

        print(f'Validation loss: {avg_val_loss}. Validation Time: {validation_time}')

    print(f'Total training took {format_time(time.time()-total_t0)}')

    torch.save(model.state_dict(), file_name)
    return model

In [22]:
jeff_model = create_model(jeff_train_dataloader, jeff_val_dataloader, 'jeff_4_epoch.pth')

Downloading:   0%|          | 0.00/523M [00:00<?, ?B/s]



Epoch 1 of 4


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: There's a lot of smoke, in a boat, there's an addiction-filled world,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: A few minutes I'd finally get in my place.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: For a girl's heart was in her eye,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: The first item in the group of the group is the thing


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: Is this some kind we're living in?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: I've got the maven's eyes so my ears aren't welling.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: I'd make that point as far as I could.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: I'm not a fan of the French.
Average Training Loss: 6.068842384740889. Epoch time: 0:01:21
Evaluating Model
Validation loss: 5.036484487889486. Validation Time: 0:00:03
Epoch 2 of 4


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: That's a corpse I'll behead.  "From Hell?"


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: A young knight in the king's comitatus.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: The force that defines his skill?


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: But she is no help at all,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: In the end he was dead."


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: That the things they do,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: What was this thing? I thought he was an old cow.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: When a child grows up in this nation.
Average Training Loss: 3.8810406034227185. Epoch time: 0:01:20
Evaluating Model
Validation loss: 5.42541269024023. Validation Time: 0:00:02
Epoch 3 of 4


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: Though I'm a sociable gal,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: As you piously stroll through Hell,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: I believe in free will and opportunity,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: On the day that she opens up,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: And it's painfully hot for me.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: I'm a nursing assistant, so I'm not optimistic.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: And they're told they're leaving the tent.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: A chef de cuisine (this cuisine is famed),
Average Training Loss: 2.6870065658784315. Epoch time: 0:01:21
Evaluating Model
Validation loss: 6.000744400982652. Validation Time: 0:00:03
Epoch 4 of 4


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: On the farm that's where he lives.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: If your bum's fat (got fat when chipped).


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: To extremes that are near arthropodic.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: To the logical logical goo


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: With technologyyes sir, but not God's Sonyes sir,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: With its front wheel facing down,


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: The clerk, a man with no clue


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Example output: This is the place where I'm from (USA).
Average Training Loss: 1.7188781321191673. Epoch time: 0:01:21
Evaluating Model
Validation loss: 7.0314850145549865. Validation Time: 0:00:03
Total training took 0:05:33


In [25]:
jeff_model.eval()

sample_outputs = jeff_model.generate(
                                generated, 
                                do_sample=True,   
                                top_k=50, 
                                max_length=512,
                                top_p=0.95, 
                                num_return_sequences=5
                                )

for i, sample_output in enumerate(sample_outputs):
    print("{}: {}\n\n".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0: But for that matter,


1: (An offense he thought clever)


2: By a Japanese mayor,


3: Then I died — saw God's light.


4: That's just rhyming slang.


