## Testing GPT2 Model
- https://huggingface.co/gpt2
- https://huggingface.co/docs/accelerate/usage_guides/training_zoo

In [1]:
!pip install --upgrade pip
!pip install transformers
!pip install torch
!pip install evaluate
!pip install tqdm
!pip install accelerate



In [1]:
import os
from accelerate.utils import write_basic_config

# write_basic_config() returns a falsy value when a config file already exists,
# so the kernel is only killed on the run that actually created the file.
# With an unconditional exit, "Restart & Run All" can never get past this cell.
if write_basic_config():
    os._exit(0)  # Hard-exit the kernel so the next start picks up the new config

  from .autonotebook import tqdm as notebook_tqdm


: 

: 

In [1]:
from transformers import pipeline, set_seed

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Build a text-generation pipeline backed by the pretrained 'gpt2' checkpoint.
generator = pipeline('text-generation', model='gpt2')
# Seed right before sampling; statement order matters for getting the same samples again.
set_seed(42)
# Request 5 continuations of the prompt, each capped by max_length=50.
generator("Hello, I'm a language model,", max_length=50, num_return_sequences=5)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': "Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself. It is not the Python, but rather: Python is the open source language, inspired by Python"},
 {'generated_text': "Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know how to represent our language. In order to know how to express a model, though, my code"},
 {'generated_text': "Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that front, and the compiler that comes with them was just sort of a mess.\n\nPorter"},
 {'generated_text': "Hello, I'm a language model, a functional model... It's not me, it's me!\n\nI won't bore you with how it works.\n\nYou hear me when you write to me.\n\nWell, I'm"},
 {'generated_text': "Hello, I'm a language model, not an object model.\n\nIn a nutshell, I need to giv

### Fine-tuning the Model

In [3]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from torch.utils.data import Dataset, DataLoader, TensorDataset

In [4]:
# Load the pre-trained GPT-2 tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Register '<PAD>' as the tokenizer's padding token. The return value is kept in
# num_added_toks but is not read by any of the cells visible here.
# NOTE(review): presumably this extends the vocabulary by one id, so a model fed that
# id would need its embeddings resized first -- confirm before padding with it.
special_tokens_dict = {'pad_token': '<PAD>'}
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

# Model loading is disabled here; training_loop() creates its own model instance.
# model = GPT2LMHeadModel.from_pretrained(model_name)

#### Tokenize the dataset

In [None]:
# Define constants
MAX_LENGTH = 512  # token cap passed to tokenizer.encode (truncation) and to model.generate
MODEL_NAME = 'gpt2'  # NOTE(review): appears unused in the visible cells; `model_name` is used instead
FILE_PATH = './data/calregs.txt'  # text file read by RegulationsDataset via get_dataloader()

# Define dataset class
class RegulationsDataset(Dataset):
    """Sentence-level dataset of token-id tensors built from a plain-text file.

    The file is read once, split on '.', and every non-blank fragment is
    encoded with ``tokenizer`` and stored as a 1-D ``torch.long`` tensor.

    Args:
        file_path: Path to a UTF-8 text file.
        tokenizer: Object exposing ``encode(text, max_length=..., truncation=True)``
            that returns a sequence of integer token ids.
        max_length: Maximum number of token ids kept per sentence. ``None``
            (the default) falls back to the module-level ``MAX_LENGTH``.
    """

    def __init__(self, file_path, tokenizer, max_length=None):
        self.tokenizer = tokenizer
        # Resolved at call time so the previous two-argument usage keeps working.
        self.max_length = MAX_LENGTH if max_length is None else max_length

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # Strip first, then filter, so whitespace-only fragments are dropped in one pass.
        sentences = [s for s in (part.strip() for part in text.split('.')) if s]

        # dtype is pinned so that even an empty encoding yields an integer tensor
        # (torch.tensor([]) would otherwise default to float32).
        self.input_ids = [
            torch.tensor(
                self.tokenizer.encode(sentence, max_length=self.max_length, truncation=True),
                dtype=torch.long,
            )
            for sentence in sentences
        ]

    def __len__(self):
        """Return the number of encoded sentences."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the 1-D token-id tensor of sentence ``idx``."""
        return self.input_ids[idx]


# Define collate function
def collate_fn(batch, pad_token_id=0):
    """Pad a list of 1-D token-id tensors into one batch and build its attention mask.

    Args:
        batch: List of 1-D integer tensors of varying length.
        pad_token_id: Value written into the padded positions (default 0, as before).

    Returns:
        dict with ``input_ids`` of shape (batch, longest sequence) and an
        ``attention_mask`` of the same shape holding 1 for real tokens and 0 for padding.
    """
    # Pad batch to the length of its longest sequence
    input_ids = torch.nn.utils.rnn.pad_sequence(batch, batch_first=True, padding_value=pad_token_id)

    # Derive the mask from the sequence lengths instead of comparing against the pad
    # value: a genuine token whose id equals the pad value (0 by default) would
    # otherwise be masked out as if it were padding.
    lengths = torch.tensor([seq.size(0) for seq in batch], device=input_ids.device)
    positions = torch.arange(input_ids.size(1), device=input_ids.device).unsqueeze(0)
    attention_mask = (positions < lengths.unsqueeze(1)).long()

    return {'input_ids': input_ids, 'attention_mask': attention_mask}


#### Training Parameters

In [None]:
from transformers import get_scheduler

# Define the training parameters
BATCH_SIZE = 4  # sequences per batch handed to the DataLoader
EPOCHS = 5  # number of passes over the data in the training loops
LEARNING_RATE = 1e-5  # learning rate of the module-level Adam optimizer

# Create a PyTorch DataLoader for batching the input-output pairs
def get_dataloader(batch_size: int = 64):
    """Build a shuffling DataLoader over the regulations text file.

    Args:
        batch_size: Number of sequences per batch.

    Returns:
        A ``DataLoader`` over ``RegulationsDataset(FILE_PATH)`` whose batches are
        the dicts produced by ``collate_fn``.
    """
    dataset = RegulationsDataset(FILE_PATH, tokenizer=tokenizer)

    # Honour the argument: the previous version silently used the global BATCH_SIZE,
    # so callers asking for a different batch size never got it.
    return DataLoader(
        dataset=dataset,
        batch_size=batch_size,
        collate_fn=collate_fn,
        shuffle=True,
    )

#### Optimizer and Loss Function

In [None]:
# Build the objects this cell depends on so it also runs on a fresh kernel
# (neither name is created by any earlier top-level cell).
data_loader = get_dataloader(BATCH_SIZE)
model = GPT2LMHeadModel.from_pretrained(model_name)

# The scheduler is stepped once per batch, so the schedule has to span
# EPOCHS passes over the loader (not BATCH_SIZE passes).
NUM_TRAINING_STEP = EPOCHS * len(data_loader)

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Define loss function
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)

# Create a default learning rate scheduler
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=NUM_TRAINING_STEP,
)

#### Ready to Train

Accelerate is a library that enables the same PyTorch code to be run across any distributed configuration. In short, training and inference at scale made simple, efficient and adaptable.

In [5]:
from accelerate import Accelerator
from tqdm.auto import tqdm
# accelerator = Accelerator()

In [None]:
def training_loop(mixed_precision="fp16", seed: int = 42, batch_size: int = 64):
    """Fine-tune the output head of GPT-2 on the regulations text using Accelerate.

    Args:
        mixed_precision: Forwarded to ``Accelerator(mixed_precision=...)``.
        seed: Seed passed to ``set_seed``.
        batch_size: Batch size requested from ``get_dataloader``.

    Returns:
        None. The average loss of every epoch is printed; the trained model is
        local to this function and is not returned.
    """
    set_seed(seed)
    # Initialize accelerator
    accelerator = Accelerator(mixed_precision=mixed_precision)

    # Build Dataloader
    data_loader = get_dataloader(batch_size)

    # Create Model
    model = GPT2LMHeadModel.from_pretrained(model_name)

    # Freeze the base model and train only the language-modelling head.
    # GPT2LMHeadModel has no `get_classifier()` (that is an image-model API);
    # its head is exposed through `get_output_embeddings()`.
    # NOTE(review): GPT-2 presumably ties this head to the input embeddings, in
    # which case unfreezing it also unfreezes the token embedding matrix.
    for param in model.parameters():
        param.requires_grad = False
    for param in model.get_output_embeddings().parameters():
        param.requires_grad = True

    # Instantiate the optimizer over the parameters that are actually trained
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.Adam(params=trainable_params, lr=3e-2 / 25)

    # Instantiate the learning rate scheduler.
    # One scheduler step is taken per batch, for EPOCHS passes over the loader.
    num_training_steps = EPOCHS * len(data_loader)
    lr_scheduler = get_scheduler(
        name="linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    # Prepare everything. Objects come back in the order they were passed in.
    model, optimizer, data_loader, lr_scheduler = accelerator.prepare(
        model, optimizer, data_loader, lr_scheduler
    )

    progress_bar = tqdm(range(num_training_steps))

    for epoch in range(EPOCHS):
        model.train()
        running_loss = 0.0
        num_examples = 0
        for batch in data_loader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']

            # Padding positions get label -100 so they do not contribute to the loss.
            labels = input_ids.masked_fill(attention_mask == 0, -100)

            # Forward pass. The model shifts the labels internally, so `outputs.loss`
            # is the next-token loss; comparing logits with the unshifted input_ids
            # would instead reward copying the current token.
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            # Backward pass
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)
            batch_len = input_ids.size(0)
            running_loss += loss.item() * batch_len
            num_examples += batch_len

        # Average over every example seen this epoch, not over the last batch only.
        epoch_loss = running_loss / max(num_examples, 1)
        print(f'Epoch {epoch+1}/{EPOCHS}: loss={epoch_loss:.4f}')

In [None]:
# Let Accelerate pick the device. Training on a CPU may take several hours
# instead of a couple of minutes, so use a GPU if you have access to one.
# The Accelerator is created here because no earlier cell instantiates it.
accelerator = Accelerator()
device = accelerator.device

# move the model to the specified device (trailing ';' hides the long model repr)
model.to(device);

In [None]:
# Hand the training objects to Accelerate; prepare() returns them in the order given.
model, optimizer, data_loader, lr_scheduler = accelerator.prepare(model, optimizer, data_loader, lr_scheduler)

In [None]:
from tqdm.auto import tqdm

# One optimizer step is taken per batch for EPOCHS passes over the loader,
# so that is the number of ticks the progress bar needs.
total_steps = EPOCHS * len(data_loader)
progress_bar = tqdm(range(total_steps))

for epoch in range(EPOCHS):
    model.train()
    running_loss = 0.0
    num_examples = 0
    for batch in data_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Padding positions get label -100 so they do not contribute to the loss.
        labels = input_ids.masked_fill(attention_mask == 0, -100)

        # Forward pass. The model shifts the labels internally, so `outputs.loss`
        # is the next-token loss; scoring the logits against the unshifted
        # input_ids would instead reward copying the current token.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass
        accelerator.backward(loss)
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.update(1)
        batch_len = input_ids.size(0)
        running_loss += loss.item() * batch_len
        num_examples += batch_len

    # Average over every example seen this epoch, not over the last batch only.
    epoch_loss = running_loss / max(num_examples, 1)
    print(f'Epoch {epoch+1}/{EPOCHS}: loss={epoch_loss:.4f}')

In [None]:
# Define function to generate responses
def generate_response(text, max_length=None, temperature=0.7):
    """Sample a continuation of ``text`` from the global ``model``.

    Args:
        text: Prompt string.
        max_length: Value forwarded to ``model.generate(max_length=...)``.
            ``None`` (the default) uses the module-level ``MAX_LENGTH``.
        temperature: Sampling temperature forwarded to ``model.generate``.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    if max_length is None:
        max_length = MAX_LENGTH

    # Calling the tokenizer (rather than .encode) also yields the attention mask.
    encoded = tokenizer(text, return_tensors='pt').to(device)

    # Leave training mode so dropout is not applied while sampling.
    model.eval()
    output = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_length,
        do_sample=True,
        temperature=temperature,
        # Explicit pad id avoids the "Setting `pad_token_id` to `eos_token_id`" fallback.
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

In [None]:
# Test the model interactively.
# The loop ends on an empty line, on 'quit' / 'exit', or when input is interrupted,
# so the cell can finish instead of blocking the notebook forever.
while True:
    try:
        text = input('User: ')
    except (EOFError, KeyboardInterrupt):
        break
    if text.strip().lower() in {'', 'quit', 'exit'}:
        break
    response = generate_response(text)
    print('Bot:', response)

#### Evaluate

In [None]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()

# FIXME: the cells shown here build no held-out split, so the training loader is reused
# and the reported accuracy is optimistic. Swap in a real validation DataLoader.
eval_dataloader = data_loader

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Batches from collate_fn carry no "labels" key; the targets are the input ids.
    # Logits at position t score the token at position t+1, so compare shifted sequences.
    predictions = torch.argmax(outputs.logits, dim=-1)[:, :-1]
    references = batch["input_ids"][:, 1:]

    # Keep only real (non-padding) target positions; indexing with the mask also
    # flattens to the 1-D form the accuracy metric expects.
    keep = batch["attention_mask"][:, 1:].bool()
    if keep.any():
        metric.add_batch(predictions=predictions[keep].cpu(), references=references[keep].cpu())

metric.compute()

In [None]:
# Drop references to large objects before emptying the CUDA cache. The names are
# popped rather than `del`-ed so the cell also works when one of them was never
# defined (no `trainer` is created in the cells shown) and when the cell is re-run.
for _name in ("model", "trainer"):
    globals().pop(_name, None)
torch.cuda.empty_cache()

### AWS SageMaker Training Job