In [2]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 5.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 41.5 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 44.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 9.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 5.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
   

In [3]:
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from datasets import load_dataset
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv    

In [4]:
gpt2_version = 'distilgpt2'

In [5]:
### Prepare data
data = pd.read_csv('/content/drive/MyDrive/NIR/2022_spring/Shakespeare_data.csv')
test_set = data.sample(n = int(0.2*data.shape[0]))
data = data.loc[~data.index.isin(test_set.index)]
test_set = test_set.reset_index()
data = data.reset_index()

In [6]:
class ShakespeareLyrics(Dataset):  
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024, col_name='PlayerLine'):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []

        for row in data[col_name]:
          self.lyrics.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))               
        if truncate:
            self.lyrics = self.lyrics[:20000]
        self.lyrics_count = len(self.lyrics)
        
    def __len__(self):
        return self.lyrics_count

    def __getitem__(self, item):
        return self.lyrics[item]
    
dataset = ShakespeareLyrics(data['PlayerLine'], truncate=True, gpt2_type=gpt2_version, col_name='PlayerLine') 

Downloading:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/762 [00:00<?, ?B/s]

In [7]:
#Get the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(gpt2_version)
model = GPT2LMHeadModel.from_pretrained(gpt2_version)

#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None

Downloading:   0%|          | 0.00/336M [00:00<?, ?B/s]

In [8]:
def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=5, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type=gpt2_version, output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    print('------------------ START ------------------')
    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    print('------------------ END ------------------')
    return model


In [9]:
model = train(dataset, model, tokenizer)

------------------ START ------------------




Training epoch 0
0


20000it [32:23, 10.29it/s]


Training epoch 1
tensor(0.2398, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [32:21, 10.30it/s]


Training epoch 2
tensor(0.3181, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [32:22, 10.30it/s]


Training epoch 3
tensor(0.3326, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [32:22, 10.29it/s]


Training epoch 4
tensor(0.2504, device='cuda:0', grad_fn=<NllLossBackward0>)


20000it [32:23, 10.29it/s]

------------------ END ------------------





In [15]:
from datasets import load_dataset
from tqdm import tqdm

def calc_ppl(curr_model, curr_tokenizer, curr_device):
    encodings = curr_tokenizer('\n\n'.join(test_set['PlayerLine']), return_tensors='pt')

    max_length = curr_model.config.n_positions
    stride = 512

    nlls = []
    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
        trg_len = end_loc - i 
        input_ids = encodings.input_ids[:, begin_loc:end_loc].to(curr_device)
        target_ids = input_ids.clone()
        target_ids[:, :-trg_len] = -100

        with torch.no_grad():
            outputs = curr_model(input_ids, labels=target_ids)
            neg_log_likelihood = outputs[0] * trg_len

        nlls.append(neg_log_likelihood)

    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)
    return ppl

In [16]:
device = 'cuda'
ppl_mymodel = calc_ppl(curr_model=model, curr_tokenizer=tokenizer, curr_device=device)
print('MY MODEL PPL: ', ppl_mymodel)

100%|██████████| 530/530 [01:14<00:00,  7.10it/s]


MY MODEL PPL:  tensor(217.7498, device='cuda:0')


In [17]:
device = 'cuda'
model = GPT2LMHeadModel.from_pretrained(gpt2_version).to(device)
tokenizer = GPT2TokenizerFast.from_pretrained(gpt2_version)
ppl_not_fune_tuning_model = calc_ppl(curr_model=model, curr_tokenizer=tokenizer, curr_device=device)
print('ANOTHER MODEL PPL: ', ppl_not_fune_tuning_model)

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (271112 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 530/530 [01:11<00:00,  7.42it/s]


ANOTHER MODEL PPL:  tensor(174.2802, device='cuda:0')
