In [1]:
from pathlib import Path

In [26]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

In [3]:
DATA_PATH = Path("/mnt/c/Users/nikol/Projects/gpt2test/data/tinyshakespeare.txt")

In [7]:
texts = DATA_PATH.open("r").readlines()
texts[:10]

['First Citizen:\n',
 'Before we proceed any further, hear me speak.\n',
 '\n',
 'All:\n',
 'Speak, speak.\n',
 '\n',
 'First Citizen:\n',
 'You are all resolved rather to die than to famish?\n',
 '\n',
 'All:\n']

### Test inference

In [29]:
# Get tokenizer for gpt2
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [30]:
# Load model, use language model head
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [35]:
# Tokenize text and return pytorch tensor
text = "Be or not to be, thats a question."
tokenizer_out = tokenizer(text, return_tensors='pt')
input_ids = tokenizer_out["input_ids"]
attention_mask = tokenizer_out["attention_mask"]

In [37]:
# Use inference
output = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=1000, do_sample=True)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [42]:
# Decode tokens
out = tokenizer.decode(output[0], skip_special_tokens=True)
print(" ".join([line.strip() for line in out.split()]))

Be or not to be, thats a question. It's like how people get paid? It's like you can never get paid, right? If you were to buy a car in your 20s to 20s, do you think there would be any downside to that? It's the same as buying a house. How does that affect you personally? No. That's the thing. Because you think once you're dead, you don't want to live. So you go, 'Who are you?' People say, 'When you are dead, you don't want to live.' We're like, 'What is your life to living?' You never answer those questions, so you don't know. You feel you've told people that for one reason or another, but I think that's the key thing is that, when you've passed away, how do you know? People don't know how to live out their lives or how to live back up with, to be honest, their kids, their grandkids—all of that is a massive responsibility. Advertisement So I guess that's my point. I really don't want to be there any more. Yeah. Advertisement Have you ever talked about a family-wise philosophy of life? 

### Prototype fine-tuning

In [1]:
from pathlib import Path
from dataclasses import dataclass

from tqdm.auto import tqdm
import torch
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, get_scheduler

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
@dataclass
class TrainingParams:
    pretrained_model_name_or_path: str
    
    data_path: Path
    checkpoints_dir: Path

    num_train_epochs: int
    batch_size: int
    learning_rate: float

In [5]:
gpt_finetune_shakespeare = TrainingParams(
    pretrained_model_name_or_path="gpt2",
    data_path=Path("/mnt/c/Users/nikol/Projects/gpt2test/data/tinyshakespeare.txt"),
    checkpoints_dir=Path("./checkpoints"),
    num_train_epochs=5,
    batch_size=8,
    learning_rate=2e-5,)

In [6]:
class Trainer:
    def __init__(self, params: TrainingParams):
        self._params = params
        
        self._tokenizer = GPT2Tokenizer.from_pretrained(
            pretrained_model_name_or_path=params.pretrained_model_name_or_path)
        
        # Init dataset
        self._dataset = TextDataset(tokenizer=self._tokenizer, file_path=params.data_path, block_size=128)
                
        # Init data loaders
        self._train_dataloader = DataLoader(self._dataset, batch_size=params.batch_size)
    
    @property
    def tokenizer(self) -> GPT2Tokenizer:
        return self._tokenizer
    
    @property
    def num_batches(self) -> int:
        return len(self._train_dataloader)
    
    @property
    def data_size(self) -> int:
        return len(self._dataset)
    
    def train(self, model: GPT2LMHeadModel) -> tuple[GPT2LMHeadModel, int, Path]:
        # Init optimizer
        optimizer = AdamW(model.parameters(), lr=self._params.learning_rate)
        
        # Init scheduler
        num_epochs = self._params.num_train_epochs
        num_training_steps = num_epochs * len(self._train_dataloader)
        progress_bar = tqdm(range(num_training_steps))
        lr_scheduler = get_scheduler(
            name="linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=num_training_steps
        )
        
        # Load to device
        device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
        model = model.to(device)
        
        # Train loop
        model.train()
        
        for epoch in range(num_epochs):
            for batch in self._train_dataloader:
                # batch = {k: v.to(device) for k, v in batch.items()}
                
                batch = batch.to(device)
                outputs = model(batch, labels=batch)
                loss = outputs.loss
                loss.backward()
                
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                progress_bar.update(1)
            
            print(f"Epoch: {epoch} loss: {loss.cpu().detach().numpy()}")
            torch.save(model.state_dict(), self._params.checkpoints_dir / f"gpt2s-{epoch}.pt",)
        
        last_ckpt = self._params.checkpoints_dir / f"gpt2s-{epoch}.pt"
        return model, loss.cpu().detach().numpy(), last_ckpt

In [7]:
model = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path=gpt_finetune_shakespeare.pretrained_model_name_or_path)

In [8]:
trainer = Trainer(params=gpt_finetune_shakespeare)
print(trainer.data_size, trainer.num_batches)

2640 330




In [9]:
model, loss, last_ckpt = trainer.train(model=model)

 20%|██████████▊                                           | 330/1650 [01:17<04:50,  4.54it/s]

Epoch: 0 loss: 3.5530896186828613


 40%|█████████████████████▌                                | 660/1650 [02:30<03:35,  4.59it/s]

Epoch: 1 loss: 3.406376600265503


 60%|████████████████████████████████▍                     | 990/1650 [03:44<02:26,  4.52it/s]

Epoch: 2 loss: 3.3193750381469727


 80%|██████████████████████████████████████████▍          | 1320/1650 [04:57<01:13,  4.50it/s]

Epoch: 3 loss: 3.27297306060791


100%|█████████████████████████████████████████████████████| 1650/1650 [06:11<00:00,  4.53it/s]

Epoch: 4 loss: 3.266934633255005


100%|█████████████████████████████████████████████████████| 1650/1650 [06:12<00:00,  4.43it/s]


In [10]:
# Test fine-tuned model
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path=gpt_finetune_shakespeare.pretrained_model_name_or_path)
# Tokenize text and return pytorch tensor
text = "Be or not to be, thats a question."
tokenizer_out = tokenizer(text, return_tensors='pt')
input_ids = tokenizer_out["input_ids"]
attention_mask = tokenizer_out["attention_mask"]

In [11]:
model_fine_tuned = GPT2LMHeadModel.from_pretrained(
    pretrained_model_name_or_path=gpt_finetune_shakespeare.pretrained_model_name_or_path,
    state_dict=torch.load(last_ckpt),
)

In [12]:
output = model_fine_tuned.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=1000,
    do_sample=True,
)
out = tokenizer.decode(output[0], skip_special_tokens=True)
print(" ".join([line.strip() for line in out.split()]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Be or not to be, thats a question. I do think that's your business, and you should be my husband. How was I made your master? MESSAGE: If you make any sort of acquaintance with me, I must tell you myself, 'tis my husband; he shall not lie with you in such a matter, for I'll think him good for it. HENRY BOLINGBROKE: No, you know me well enough to believe this: it was not done till my father, George Edward Blount, came to England with you, for my father's sake. Away with me: I have to hear him speak. 'Tis his way to the king, though he is too young to live, for he is like a good horse and a noble hatter; nor so with me; but, as the king says,'s not far off' The Duke of York will send him a present, for he is the elder and more gentle Richard's sister; not so with myself: not as you mean, but as a good man with a wife to be married; an Englishwoman is very mean and sweet indeed, and will live; therefore for if she should live to-morrow To-day, she could not come a fortnight without your w