In [1]:
import transformers
# Reference article:
# https://www.it-jim.com/blog/training-and-fine-tuning-gpt-2-and-gpt-3-models-using-hugging-face-transformers-and-openai-api/


# Model identifier handed to `transformers.pipeline` / `from_pretrained` in the cells that follow.
MODEL_NAME = 'gpt2'

In [3]:
# Smoke test: build a text-generation pipeline on the CPU and print one continuation.
pipe = transformers.pipeline(
    task='text-generation',
    model=MODEL_NAME,
    device='cpu',
)
print(pipe('The elf queen'))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The elf queen, the two red, gold-haired creatures that would go to the top of the arena, looked on with confusion. "But our king, our lord has just arrived!"\n\nThis time, they couldn\'t predict the outcome or'}]


In [5]:
# Load the tokenizer and the LM-head model directly instead of going through a pipeline.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Encode a one-prompt batch as PyTorch tensors, then decode it back as a round-trip check.
enc = tokenizer(['The elf queen'], return_tensors='pt')
print('enc =', enc)
print(tokenizer.batch_decode(enc['input_ids']))

enc = {'input_ids': tensor([[  464, 23878, 16599]]), 'attention_mask': tensor([[1, 1, 1]])}
['The elf queen']


In [None]:
# Load the configuration associated with the pretrained checkpoint.
config = transformers.GPT2Config.from_pretrained(MODEL_NAME)
# Bare last expression: the notebook shows the config as the cell output.
config

In [8]:
# Copy the checkpoint's 'text-generation' task parameters (do_sample, max_length)
# onto top-level config attributes, then reload the model with that config.
# NOTE(review): presumably so that model.generate() picks these up as defaults — confirm.
config.do_sample = config.task_specific_params['text-generation']['do_sample']
config.max_length = config.task_specific_params['text-generation']['max_length']
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME, config=config)

In [10]:
# Generate with the model that was reloaded with the customised `config`.
# Passing the checkpoint name here instead would load a fresh default model and
# leave that configured `model` unused. A model instance (unlike a name) needs
# its tokenizer supplied explicitly.
pipe = transformers.pipeline(
    task='text-generation',
    model=model,
    tokenizer=tokenizer,
    device='cpu',
)
print(pipe('The elf queen'))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[{'generated_text': 'The elf queen, an orphan boy with huge head and big hands, was about to become a godmother! When she saw my little fellow Elfling I knew something was going on as we watched her take one of our small children to a private high'}]


## Train

In [2]:
# Load pretrained weights and the matching tokenizer for the training section.
# NOTE(review): the training cell below loads both again, so this cell looks redundant — confirm.
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

2024-03-16 03:25:37.739165: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-16 03:25:37.787484: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# By IT-JIM, 2023
# Train GPT-2 with PyTorch (no Trainer)


import sys

import numpy as np
import torch
import torch.utils.data
import transformers
import tqdm

from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter

# Hugging Face identifier of the checkpoint to fine-tune.
MODEL_NAME = 'gpt2'
# Training corpus: a UTF-8 text file that is read one sample per line.
TEXT_CORPUS = '/data/full_pixiv_harvest_prompt/prompts_1832.txt'
# Prefer the GPU, but fall back to the CPU so the notebook still runs without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

TOKEN_ENDOFTEXT = 50256  # '<|endoftext|>'
BLOCK_LEN = 300  # Maximum tokens per sample, including the end-of-text token
BATCH_SIZE = 16  # Default batch size



########################################################################################################################
def print_it(a, name: str = ''):
    """Print a one-line summary of an array-like: name, shape, dtype, min, mean, max.

    Torch tensors are cast to float before averaging; any other object is
    expected to provide its own ``mean()``.
    """
    if isinstance(a, torch.Tensor):
        mean_value = a.float().mean()
    else:
        mean_value = a.mean()
    print(name, a.shape, a.dtype, a.min(), mean_value, a.max())


########################################################################################################################
class MyDset(torch.utils.data.Dataset):
    """Map-style dataset over pre-tokenized sequences.

    Every sample is a dict holding ``input_ids``, an all-ones ``attention_mask``
    of the same length, and ``labels``. ``labels`` is the very same tensor
    object as ``input_ids``; sequences keep whatever length they were given.
    """

    def __init__(self, data: list[list[int]]):
        # Convert everything to tensors once, up front.
        self.data = [self._to_sample(token_ids) for token_ids in data]

    @staticmethod
    def _to_sample(token_ids: list[int]) -> dict:
        """Build the tensor dict for one token-id sequence."""
        ids = torch.tensor(token_ids, dtype=torch.int64)
        mask = torch.ones(len(token_ids), dtype=torch.int64)
        return {'input_ids': ids, 'attention_mask': mask, 'labels': ids}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        return self.data[idx]


########################################################################################################################
def break_text_to_lines(text_path: str, tokenizer: transformers.PreTrainedTokenizer, max_length: int) -> list[list[int]]:
    """
    Tokenize a text file line by line, ending every sample with TOKEN_ENDOFTEXT.

    Each non-blank line is stripped of surrounding whitespace, encoded with
    `tokenizer`, truncated to at most `max_length - 1` tokens and terminated with
    TOKEN_ENDOFTEXT. Blank (whitespace-only) lines are skipped, because a sample
    made of the end token alone has no next-token target to learn from.

    Args:
    - text_path: Path to the UTF-8 text file.
    - tokenizer: An instance of transformers.PreTrainedTokenizer.
    - max_length: The maximum length for each tokenized line, including TOKEN_ENDOFTEXT.

    Returns:
    A list of tokenized lines, each as a list of integers.

    Raises:
    - ValueError: If max_length is smaller than 2 (one token plus TOKEN_ENDOFTEXT).
    """
    if max_length < 2:
        raise ValueError(f'max_length must be at least 2, got {max_length}')

    content_budget = max_length - 1  # Leave space for TOKEN_ENDOFTEXT
    tokenized_lines = []
    with open(text_path, encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            if not text:
                continue  # Blank line: nothing to tokenize
            tokens = tokenizer.encode(text)[:content_budget]
            tokens.append(TOKEN_ENDOFTEXT)  # Ensure the end token is added
            tokenized_lines.append(tokens)
    return tokenized_lines



########################################################################################################################
def train_val_split(data: list, ratio: float):
    """Split `data` into a training part and a validation part, without shuffling.

    The leading `int(len(data) * ratio)` items become the validation set and the
    remainder the training set. The validation size is clamped to the range
    [1, len(data) - 1] so that neither split is ever empty.

    Args:
    - data: The samples to split; at least two are required.
    - ratio: Fraction of the samples to put in the validation set.

    Returns:
    A `(train, val)` tuple of slices of `data`.
    """
    n = len(data)
    assert n >= 2, 'need at least two samples for non-empty train and validation splits'
    # Clamp on both sides: a tiny ratio still yields one validation sample, and
    # a ratio of 1 or more still leaves one training sample.
    n_val = min(n - 1, max(1, int(n * ratio)))
    return data[n_val:], data[:n_val]


########################################################################################################################
def prepare_dsets(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int, val_ratio: float = 0.2):
    """Read and tokenize the corpus, then build the training and validation datasets.

    Args:
    - text_path: Path to the text corpus, read line by line.
    - tokenizer: Tokenizer used to encode each line.
    - block_len: Maximum number of tokens per sample, end token included.
    - val_ratio: Fraction of the samples held out for validation (default 0.2).

    Returns:
    A `(train_dataset, val_dataset)` pair of MyDset instances.
    """
    data = break_text_to_lines(text_path, tokenizer, block_len)
    data_train, data_val = train_val_split(data, val_ratio)
    return MyDset(data_train), MyDset(data_val)




def pad_collate(batch):
    """
    Collate variable-length samples into one batch, padding every field to the
    length of the longest sample in that batch.

    `input_ids` and `attention_mask` are padded with 0; `labels` are padded with
    -100, the value commonly used as ignore_index in PyTorch.
    """
    # Field name -> fill value, listed in the order the keys appear in the result.
    fill_values = {'input_ids': 0, 'attention_mask': 0, 'labels': -100}
    return {
        field: pad_sequence([sample[field] for sample in batch], batch_first=True, padding_value=fill)
        for field, fill in fill_values.items()
    }


########################################################################################################################
def train_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader, optimizer: torch.optim.Optimizer):
    """Run one training epoch and return the mean of the per-batch losses.

    Args:
    - model: Model called with `input_ids`, `attention_mask` and `labels`; its
      output must expose the loss under the 'loss' key.
    - loader: Yields dict batches of tensors with those three keys.
    - optimizer: Optimizer stepped once per batch.

    Returns:
    The mean batch loss over the epoch, as computed by np.mean.
    """
    model.train()
    losses = []
    pbar = tqdm.tqdm(loader)
    for batch in pbar:
        # Build a new dict on the target device instead of overwriting the loader's batch.
        batch = {k: v.to(DEVICE) for k, v in batch.items()}
        optimizer.zero_grad()
        out = model(input_ids=batch['input_ids'], attention_mask=batch['attention_mask'], labels=batch['labels'])
        # loss, logits, past_key_values
        loss = out['loss']
        loss.backward()
        optimizer.step()
        # Read the scalar back once and reuse it for the progress bar and the epoch mean.
        loss_value = loss.item()
        pbar.set_description(f'loss={loss_value}')
        losses.append(loss_value)

    return np.mean(losses)


########################################################################################################################
def val_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader):
    """Evaluate the model over one pass of `loader` and return the mean batch loss."""
    model.eval()
    batch_losses = []
    with torch.no_grad():  # Evaluation never needs gradients
        for batch in tqdm.tqdm(loader):
            on_device = {name: tensor.to(DEVICE) for name, tensor in batch.items()}
            # The output carries loss, logits, past_key_values; only the loss is used.
            result = model(
                input_ids=on_device['input_ids'],
                attention_mask=on_device['attention_mask'],
                labels=on_device['labels'],
            )
            batch_losses.append(result['loss'].item())

    return np.mean(batch_losses)

def generate_text(model, tokenizer, text, max_length=20, device='cuda'):
    """
    Generates text using the trained model and a starting text.

    Args:
    - model: The trained model; it is switched to eval mode and left there.
    - tokenizer: The tokenizer for the model.
    - text: Starting text for generation.
    - max_length: Forwarded to `model.generate()` as `max_length`. In the Hugging
      Face generate API this limit counts the prompt tokens as well as the new ones.
    - device: The device to run the generation on ('cuda' or 'cpu').

    Returns:
    A string containing the decoded output (special tokens removed).
    """
    model.eval()
    batch = tokenizer([text], return_tensors='pt')
    inputs = {k: v.to(device) for k, v in batch.items()}

    generate_kwargs = {'max_length': max_length}
    # When the model has no pad token, generate() falls back to the EOS token and
    # warns on every call. Pass the same fallback explicitly to keep the output clean.
    model_pad_id = getattr(getattr(model, 'generation_config', None), 'pad_token_id', None)
    eos_token_id = getattr(tokenizer, 'eos_token_id', None)
    if model_pad_id is None and eos_token_id is not None:
        generate_kwargs['pad_token_id'] = eos_token_id

    generated_output = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        **generate_kwargs,
    )
    generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    return generated_text



# TensorBoard logging destination for this run
writer = SummaryWriter('./runs/gpt2_training')

# Pretrained tokenizer and model
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)

# Tokenized train/validation datasets and their loaders
dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)

loader_train = torch.utils.data.DataLoader(
    dset_train,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=pad_collate,
)
loader_val = torch.utils.data.DataLoader(
    dset_val,
    batch_size=BATCH_SIZE,
    collate_fn=pad_collate,
)


# Move the model to the target device, then create the optimizer over its parameters
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)


# Prompts that the training loop completes with the model after every epoch.
test_prompts = [
    "1girl, long hair, white hair",
    "1girl, purple hair",
    "gothic lolita",
    "hatsune miku",
]

# Training loop
# The try/finally guarantees that the TensorBoard writer is flushed and closed
# even when training is interrupted or fails part-way through.
try:
    for i_epoch in range(20):
        loss_train = train_one(model, loader_train, optimizer)
        writer.add_scalar('Loss/train', loss_train, i_epoch)

        loss_val = val_one(model, loader_val)
        writer.add_scalar('Loss/val', loss_val, i_epoch)

        print(f'{i_epoch} : loss_train={loss_train}, loss_val={loss_val}')

        # Generate one sample per test prompt
        texts = [generate_text(model, tokenizer, prompt, 20, DEVICE) for prompt in test_prompts]

        writer.add_text('Generated', '\n'.join(texts), i_epoch)
        print(f"epoch {i_epoch} : {texts}")

        # Save a checkpoint of the model and tokenizer for this epoch
        model.save_pretrained(f'./trained_model/epoch_{i_epoch}/')
        tokenizer.save_pretrained(f'./trained_model/epoch_{i_epoch}/')
finally:
    writer.close()



# Training is finished: run one last generation as a sanity check.
sample_text = "1girl, long hair, white hair"
generated_text = generate_text(
    model,
    tokenizer,
    sample_text,
    max_length=20,
    device=DEVICE,
)
print('GENERATION=', generated_text)

 27%|██▋       | 665/2500 [02:17<07:26,  4.11it/s]