## Testing GPT2 Model
https://huggingface.co/gpt2

In [7]:
!pip install --upgrade pip
!pip install transformers
!pip install torch
!pip install evaluate
!pip install tqdm



In [1]:
from transformers import pipeline, set_seed


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
generator = pipeline('text-generation', model='gpt2')
set_seed(42)
generator("Hello, I'm a language model,", max_length=50, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself. It is not the Python, but rather: Python is the open source language, inspired by Python"},
 {'generated_text': "Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know how to represent our language. In order to know how to express a model, though, my code"},
 {'generated_text': "Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that front, and the compiler that comes with them was just sort of a mess.\n\nPorter"},
 {'generated_text': "Hello, I'm a language model, a functional model... It's not me, it's me!\n\nI won't bore you with how it works.\n\nYou hear me when you write to me.\n\nWell, I'm"},
 {'generated_text': "Hello, I'm a language model, not an object model.\n\nIn a nutshell, I need to giv

### Finetuning the Model

In [12]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader

In [17]:
# Load the pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

model = GPT2LMHeadModel.from_pretrained(model_name)

#### Tokenize the dataset

In [30]:
# do not run this
# Load your dataset
with open('./data/calregs.txt', 'r', encoding='utf-8') as f:
    data = f.read()

# Tokenize the dataset
encoded_data = tokenizer.encode(data, max_length=1024, truncation=True)

# Create input-output pairs for training
seq_len = model.config.n_positions
input_seqs = []
label_seqs = []
for i in range(0, len(encoded_data), seq_len):
    input_seqs.append(encoded_data[i:i+seq_len])
    label_seqs.append(encoded_data[i+1:i+seq_len+1])

# Convert input-output pairs to PyTorch tensors
input_seqs = torch.tensor(input_seqs)
label_seqs = torch.tensor(label_seqs)

# Create a PyTorch DataLoader for batching the input-output pairs
batch_size = 4
data_loader = torch.utils.data.DataLoader(
    torch.utils.data.TensorDataset(input_seqs, label_seqs),
    batch_size=batch_size,
    shuffle=True
)


In [31]:
# Define your custom dataset class
class CustomDataset(Dataset):
    def __init__(self, file_path):
        with open(file_path, 'r', encoding='utf-8') as f:
            self.data = f.readlines()

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data[idx].strip()
        input_ids = tokenizer.encode(text, add_special_tokens=True, padding='max_length', max_length=1024, truncation=True)
        label_ids = input_ids.copy()
        label_ids.pop(0)
        label_ids.append(tokenizer.eos_token_id)
        return torch.tensor(input_ids), torch.tensor(label_ids)

# Define your data loader
batch_size = 4
train_dataset = CustomDataset('./data/calregs.txt')

train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
# dont have the eval dataset yet
eval_dataloader = DataLoader(train_dataset, batch_size=batch_size)

#### Define Training Parameters

In [32]:
from transformers import get_scheduler

# Define the training parameters
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
learning_rate = 1e-5

# Define the optimizer and the loss function
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Create a default learning rate scheduler
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

# specify device to use a GPU if you have access to one. Otherwise, 
# training on a CPU may take several hours instead of a couple of minutes.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

#### Ready to Train

In [37]:
for i, (inputs, labels) in enumerate(train_dataloader):
    print(inputs)
    print(labels)

IndexError: pop from empty list

In [28]:
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
#     for batch in train_dataloader:
#         batch = {k: v.to(device) for k, v in batch.items()}
    for i, (inputs, labels) in enumerate(train_dataloader):
        inputs = inputs.to(model.device)
        labels = labels.to(model.device)
        outputs = model(inputs, labels=labels)
        # outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

# for epoch in range(num_epochs):
#     model.train()
#     for i, (inputs, labels) in enumerate(train_dataloader):
        # inputs = inputs.to(model.device)
        # labels = labels.to(model.device)
        # outputs = model(inputs, labels=labels)
#         loss = criterion(outputs.logits.view(-1, outputs.logits.size(-1)), labels.view(-1))
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         if (i+1) % 100 == 0:
#             print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(epoch+1, num_epochs, i+1, len(train_dataloader), loss.item()))

  0%|          | 0/4263 [02:39<?, ?it/s]


AttributeError: 'list' object has no attribute 'items'

#### Evaluate

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
del model
del trainer
torch.cuda.empty_cache()

### AWS SageMaker Training Job