In [5]:
import transformers
# Reference article:
# https://www.it-jim.com/blog/training-and-fine-tuning-gpt-2-and-gpt-3-models-using-hugging-face-transformers-and-openai-api/


# Hugging Face model identifier handed to `from_pretrained` by the cells below.
MODEL_NAME = 'gpt2'

In [6]:
# Load the tokenizer that belongs to MODEL_NAME; the next cells use it to count corpus tokens.
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

In [17]:
import unibox as ub
# Load one prompt file from S3 and report how many entries it contains.
text_50k = ub.loads("s3://dataset-pixiv/misc/prompts_txt_assembled_fullpixiv/prompts_1110.txt")
print(len(text_50k))

# flatten to a single string
text_50k_str = " ".join(text_50k)
# Rough tag count: the number of comma-separated pieces in the joined string.
# NOTE(review): entries are joined with " " (no comma), so the last tag of one entry and
# the first tag of the next are counted as a single piece — presumably an undercount of
# about one per entry unless entries end with a comma; confirm against the file format.
print(len(text_50k_str.split(",")))

2024-03-16 08:14:11,268 [INFO] UniLogger: UniLoader.loads: .txt LOADED from "/tmp/tmp2bkdj1pz/prompts_1110.txt" in 0.03s


50000
1011032


In [19]:
# Tokenize the whole shard as a single sequence, purely to count its tokens.
enc = tokenizer([text_50k_str], return_tensors='pt')

# input_ids holds one row (the single joined string); its second dimension is the token count.
token_length = enc['input_ids'].size(1)
print(f"Token length: {token_length}")
# Observed: Token length: 3166452

# Scaling estimate from this shard:
# every 50k prompts: has 1m total tags, and around 3m gpt2 tokens
# so 1m prompts would have 60m tokens
# 100m prompts have about 6000m tokens

Token indices sequence length is longer than the specified maximum sequence length for this model (3166452 > 1024). Running this sequence through the model will result in indexing errors


Token length: 3166452


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


## Train

In [1]:
# By IT-JIM, 2023
# Train GPT-2 with PyTorch (no Trainer)


import sys

import numpy as np
import torch
import torch.utils.data
import transformers
import tqdm

from torch.nn.utils.rnn import pad_sequence
from torch.utils.tensorboard import SummaryWriter


In [2]:
# Load the pretrained GPT-2 LM-head model and its tokenizer.
# MODEL_NAME is not assigned in this cell: it must already exist in the kernel (it is
# assigned in the first cell and again in the training-config cell further down).
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

2024-03-16 03:25:37.739165: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-03-16 03:25:37.787484: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
# Hugging Face model identifier used for both the model and the tokenizer.
MODEL_NAME = 'gpt2'
# Text file with one training sample per line.
TEXT_CORPUS = '/data/full_pixiv_harvest_prompt/prompts_1832.txt'
# Prefer the GPU, but fall back to the CPU so the notebook still runs on machines without CUDA.
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

TOKEN_ENDOFTEXT = 50256  # '<|endoftext|>'
BLOCK_LEN = 300  # per-line token cap handed to prepare_dsets (includes the end-of-text token)
BATCH_SIZE = 16  # Default batch size



########################################################################################################################
def print_it(a, name: str = ''):
    """Print a one-line summary of an array-like: name, shape, dtype, min, mean, max.

    torch tensors are converted to float before averaging; any other object is
    averaged with its own ``.mean()``.
    """
    if isinstance(a, torch.Tensor):
        mean_value = a.float().mean()
    else:
        mean_value = a.mean()
    print(name, a.shape, a.dtype, a.min(), mean_value, a.max())


########################################################################################################################
class MyDset(torch.utils.data.Dataset):
    """In-memory dataset of pre-tokenized samples with input_ids == labels.

    Each sample is a dict holding ``input_ids`` (int64), an all-ones
    ``attention_mask`` of the same length, and ``labels``, which is the very
    same tensor object as ``input_ids``. Samples keep their individual lengths.
    """
    def __init__(self, data: list[list[int]]):
        # Convert every token list up front so __getitem__ is a plain lookup.
        self.data = [self._to_sample(token_ids) for token_ids in data]

    @staticmethod
    def _to_sample(token_ids):
        """Build the sample dict for one list of token ids."""
        ids = torch.tensor(token_ids, dtype=torch.int64)
        mask = torch.ones(len(token_ids), dtype=torch.int64)
        return {'input_ids': ids, 'attention_mask': mask, 'labels': ids}

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx: int):
        return self.data[idx]


########################################################################################################################
def break_text_to_lines(text_path: str, tokenizer: transformers.PreTrainedTokenizer, max_length: int) -> list[list[int]]:
    """
    Read a file line by line and convert each non-blank line to a tokenized list,
    appending a TOKEN_ENDOFTEXT token to each. Truncate lines longer than max_length.

    Blank (whitespace-only) lines are skipped: they would otherwise become samples
    that consist of the end token alone and give the model nothing to predict.

    Args:
    - text_path: Path to the text file (read as UTF-8).
    - tokenizer: An instance of transformers.PreTrainedTokenizer.
    - max_length: The maximum length for each tokenized line, including TOKEN_ENDOFTEXT.
      Must be at least 1.

    Returns:
    A list of tokenized lines, each as a list of integers ending in TOKEN_ENDOFTEXT.

    Raises:
    ValueError: if max_length is smaller than 1 (there would be no room for the
    end token, and a negative slice bound would silently drop the wrong tokens).
    """
    if max_length < 1:
        raise ValueError(f"max_length must be >= 1, got {max_length}")

    tokenized_lines = []
    with open(text_path, encoding="utf-8") as f:
        for line in f:
            text = line.strip()
            if not text:
                continue  # Nothing to learn from an empty line
            tokens = tokenizer.encode(text)[:max_length - 1]  # Leave space for TOKEN_ENDOFTEXT
            tokens.append(TOKEN_ENDOFTEXT)  # Ensure the end token is added
            tokenized_lines.append(tokens)
    return tokenized_lines



########################################################################################################################
def train_val_split(data: list, ratio: float) -> tuple[list, list]:
    """Split ``data`` into a training part and a validation part without shuffling.

    The first ``n_val`` items become the validation set and the rest the training
    set, where ``n_val = int(len(data) * ratio)`` clamped to ``[1, len(data) - 1]``
    so that neither part is ever empty.

    Args:
    - data: The samples to split (at least two).
    - ratio: Fraction of the samples to use for validation.

    Returns:
    A ``(train, val)`` tuple of list slices.
    """
    n = len(data)
    assert n >= 2
    # Lower clamp: always validate on something. Upper clamp: a ratio >= 1 must not
    # swallow the whole dataset and leave nothing to train on.
    n_val = min(n - 1, max(1, int(n * ratio)))
    return data[n_val:], data[:n_val]


########################################################################################################################
def prepare_dsets(text_path: str, tokenizer: transformers.PreTrainedTokenizer, block_len: int, val_ratio: float = 0.2):
    """Read the text, prepare the datasets.

    Args:
    - text_path: Path to the text corpus, one sample per line.
    - tokenizer: Tokenizer used to encode each line.
    - block_len: Maximum number of tokens per sample (passed to break_text_to_lines).
    - val_ratio: Fraction of the samples held out for validation (default 0.2).

    Returns:
    A ``(train_dataset, val_dataset)`` tuple of MyDset instances.
    """
    data = break_text_to_lines(text_path, tokenizer, block_len)
    data_train, data_val = train_val_split(data, val_ratio)
    return MyDset(data_train), MyDset(data_val)




def pad_collate(batch):
    """
    Collate function that right-pads every field of a batch to the length of the
    longest sample in that batch, so sequences of different lengths can be stacked.

    ``input_ids`` and ``attention_mask`` are padded with 0; ``labels`` are padded
    with -100 (commonly used as ignore_index in PyTorch).
    """
    # Field name -> value used to fill the padded positions.
    pad_values = {'input_ids': 0, 'attention_mask': 0, 'labels': -100}

    return {
        field: pad_sequence(
            [sample[field] for sample in batch],
            batch_first=True,
            padding_value=fill,
        )
        for field, fill in pad_values.items()
    }


########################################################################################################################
def train_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader, optimizer: torch.optim.Optimizer):
    """Run one training epoch over ``loader`` and return the mean of the per-batch losses."""
    model.train()
    epoch_losses = []
    progress = tqdm.tqdm(loader)
    for batch in progress:
        # Move every tensor of the batch to the training device (in place in the dict).
        for key, tensor in list(batch.items()):
            batch[key] = tensor.to(DEVICE)

        optimizer.zero_grad()
        out = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'],
        )
        # The model output also carries logits and past_key_values; only the loss is used.
        loss = out['loss']
        loss_value = loss.item()
        progress.set_description(f'loss={loss_value}')

        loss.backward()
        optimizer.step()
        epoch_losses.append(loss_value)

    return np.mean(epoch_losses)


########################################################################################################################
def val_one(model: torch.nn.Module, loader: torch.utils.data.DataLoader):
    """Run one evaluation pass over ``loader`` and return the mean of the per-batch losses."""
    model.eval()
    batch_losses = []
    for batch in tqdm.tqdm(loader):
        # Move every tensor of the batch to the evaluation device (in place in the dict).
        for key, tensor in list(batch.items()):
            batch[key] = tensor.to(DEVICE)

        # No gradients are needed for validation.
        with torch.no_grad():
            out = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['attention_mask'],
                labels=batch['labels'],
            )
        # The model output also carries logits and past_key_values; only the loss is used.
        batch_losses.append(out['loss'].item())

    return np.mean(batch_losses)

def generate_text(model, tokenizer, text, max_length=20, device='cuda'):
    """
    Generates text using the trained model and a starting text.

    Args:
    - model: The trained model.
    - tokenizer: The tokenizer for the model.
    - text: Starting text for generation.
    - max_length: Passed to ``model.generate`` as ``max_length``.
    - device: The device to run the generation on ('cuda' or 'cpu').

    Returns:
    A string containing the decoded first generated sequence, with special
    tokens removed.
    """
    model.eval()
    batch = tokenizer([text], return_tensors='pt')
    inputs = {key: tensor.to(device) for key, tensor in batch.items()}

    # Pass the padding token explicitly. When the tokenizer defines none, fall back to
    # the end-of-sequence token, which is what `generate` would pick on its own while
    # logging "Setting `pad_token_id` to `eos_token_id`" on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    generated_output = model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=max_length,
        pad_token_id=pad_token_id,
    )
    generated_text = tokenizer.decode(generated_output[0], skip_special_tokens=True)
    return generated_text



# Initialize TensorBoard SummaryWriter
writer = SummaryWriter('./runs/gpt2_training')

# Load model and tokenizer
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_NAME)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME)

# Create datasets and loader
dset_train, dset_val = prepare_dsets(TEXT_CORPUS, tokenizer, BLOCK_LEN)

# Only the training loader is shuffled; both pad each batch to its longest sample.
loader_train = torch.utils.data.DataLoader(dset_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=pad_collate)
loader_val = torch.utils.data.DataLoader(dset_val, batch_size=BATCH_SIZE, collate_fn=pad_collate)


# Optimizer, device
model.to(DEVICE)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)


# Fixed prompts sampled after every epoch to eyeball the model's progress.
test_prompts = [
    "1girl, long hair, white hair",
    "1girl, purple hair",
    "gothic lolita",
    "hatsune miku",
]

# Training loop
# Wrapped in try/finally so the TensorBoard writer is closed even when the run is
# stopped early (e.g. by interrupting the cell), instead of leaving it open.
try:
    for i_epoch in range(20):
        loss_train = train_one(model, loader_train, optimizer)
        writer.add_scalar('Loss/train', loss_train, i_epoch)

        loss_val = val_one(model, loader_val)
        writer.add_scalar('Loss/val', loss_val, i_epoch)

        print(f'{i_epoch} : loss_train={loss_train}, loss_val={loss_val}')

        # generate a sample for every test prompt
        texts = []
        for curr_prompt in test_prompts:
            generated_text = generate_text(model, tokenizer, curr_prompt, 20, DEVICE)
            texts.append(generated_text)

        writer.add_text('Generated', '\n'.join(texts), i_epoch)
        print(f"epoch {i_epoch} : {texts}")

        # Save a checkpoint (model + tokenizer) for every epoch
        model.save_pretrained(f'./trained_model/epoch_{i_epoch}/')
        tokenizer.save_pretrained(f'./trained_model/epoch_{i_epoch}/')
finally:
    writer.close()



# Now our model is trained, try the generation
sample_text =  "1girl, long hair, white hair" 
generated_text = generate_text(model, tokenizer, sample_text, 20, DEVICE)
print('GENERATION=', generated_text)

100%|██████████| 2500/2500 [08:38<00:00,  4.82it/s]
100%|██████████| 625/625 [00:36<00:00, 17.13it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


0 : loss_train=1.8451316284179688, loss_val=1.704955134677887


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 0 : ['1girl, long hair, white hair, blue eyes, looking at viewer, upper body, simple', '1girl, purple hair, purple eyes, long hair, looking at viewer, upper body, purple', 'gothic lolita fashion, 1girl, long hair, black hair, red eyes, looking', 'hatsune miku, 1boy, male focus, open mouth, white background, simple background']


100%|██████████| 2500/2500 [08:40<00:00,  4.80it/s]
100%|██████████| 625/625 [00:36<00:00, 17.11it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


1 : loss_train=1.6190081714630127, loss_val=1.6452009156227112


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 1 : ['1girl, long hair, white hair, looking at viewer, simple background, upper body, hair', '1girl, purple hair, purple eyes, long hair, looking at viewer, hair between eyes,', 'gothic lolita fashion, multiple girls, 2girls, black hair, black eyes, black', 'hatsune miku, multiple boys, 2boys, male focus, black hair, white background']


100%|██████████| 2500/2500 [08:40<00:00,  4.80it/s]
100%|██████████| 625/625 [00:36<00:00, 17.12it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


2 : loss_train=1.5347452820777894, loss_val=1.6231368136405946


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 2 : ['1girl, long hair, white hair, simple background, white background, looking at viewer, eyebrows', '1girl, purple hair, purple eyes, hair ornament, hair flower, looking at viewer, hair', 'gothic lolita fashion, multiple girls, 2girls, black hair, garter straps,', 'hatsune mikuwa, 1girl, 1boy, long hair, black hair, white']


100%|██████████| 2500/2500 [08:39<00:00,  4.81it/s]
100%|██████████| 625/625 [00:36<00:00, 17.11it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


3 : loss_train=1.4676329357147218, loss_val=1.6151107023239135


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 3 : ['1girl, long hair, white hair, looking at viewer, simple background, upper body, closed', '1girl, purple hair, side ponytail, bodysuit, open mouth, solo focus,', 'gothic lolita fashion, 1girl, black hair, long hair, black eyes, black', 'hatsune mikuwa, 1boy, male focus, white background, simple background, upper']


100%|██████████| 2500/2500 [08:41<00:00,  4.79it/s]
100%|██████████| 625/625 [00:36<00:00, 17.12it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


4 : loss_train=1.4041760246753692, loss_val=1.6121928741455078


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 4 : ['1girl, long hair, white hair, black legwear, black gloves, black skirt, z', '1girl, purple hair, saliva, blindfold, heart, electricity, drooling, bound,', 'gothic lolita fashion, 1girl, long hair, black hair, gothic lol', 'hatsune miku, 1girl, 1boy, blonde hair, black hair, closed eyes']


100%|██████████| 2500/2500 [08:39<00:00,  4.81it/s]
100%|██████████| 625/625 [00:36<00:00, 17.16it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


5 : loss_train=1.3414155537605286, loss_val=1.6208739398002625


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 5 : ['1girl, long hair, white hair, blue eyes, looking at viewer, simple background, upper', '1girl, purple hair, fingerless gloves, purple eyes, short hair, large breasts, hair', 'gothic lolita fashion, 1girl, 1boy, long hair, black hair, multiple', 'hatsune mikuwa, multiple boys, 2boys, black hair, male focus, white']


100%|██████████| 2500/2500 [08:40<00:00,  4.81it/s]
100%|██████████| 625/625 [00:36<00:00, 17.13it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


6 : loss_train=1.2767953294754029, loss_val=1.6414961971759796


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 6 : ['1girl, long hair, white hair, simple background, looking at viewer, upper body, eyebrows', '1girl, purple hair, fingerless gloves, purple eyes, short hair, knee boots, standing', 'gothic lolita hairband, 1girl, black hair, long hair, black legwear', 'hatsune mikuwa, no humans, white background, simple background, fried egg on toast']


100%|██████████| 2500/2500 [08:40<00:00,  4.80it/s]
100%|██████████| 625/625 [00:36<00:00, 17.11it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


7 : loss_train=1.2084411702156066, loss_val=1.6784927740573883


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 7 : ['1girl, long hair, white hair, blue eyes, looking at viewer, simple background, upper', '1girl, purple hair, purple eyes, long hair, smile, looking at viewer, hair between', 'gothic lolita fashion, 1girl, black hair, gothic lolita, black', 'hatsune mikuwa, 1boy, food, facial hair, mustache, male focus,']


100%|██████████| 2500/2500 [08:39<00:00,  4.82it/s]
100%|██████████| 625/625 [00:36<00:00, 17.13it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


8 : loss_train=1.1370734945535659, loss_val=1.7081873000144958


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 8 : ['1girl, long hair, white hair, simple background, upper body, looking at viewer, black', '1girl, purple hair, fingerless gloves, from below, electricity, purple eyes, short hair', 'gothic lolita fashion, 1girl, black hair, 1boy, gothic lol', 'hatsune mikuwa (fate) (cosplay), 1boy, arrow (project']


100%|██████████| 2500/2500 [08:38<00:00,  4.82it/s]
100%|██████████| 625/625 [00:36<00:00, 17.10it/s]
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


9 : loss_train=1.0635445720911025, loss_val=1.7782830815315247


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


epoch 9 : ['1girl, long hair, white hair, ponytail, simple background, hair over one eye,', '1girl, purple hair, fingerless gloves, long hair, knee boots, standing, from behind', 'gothic lolita, 1girl, 1boy, brown hair, long hair, black hair', 'hatsune mikuwa (cosplay), food, 1girl, :3, purple hair']


 92%|█████████▏| 2297/2500 [07:58<00:42,  4.80it/s]


KeyboardInterrupt: 

## Aug

Combining the prompt `.txt` files under `./txts` into a single `full_pixiv_prompts.txt`:

In [None]:
import unibox as ub
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor

import subprocess
from tqdm.auto import tqdm
import gc


def concurrent_loads(uris_list, num_workers=8):
    """
    Loads objects concurrently from a list of URIs, one `ub.loads` call per URI,
    using a pool of worker processes.

    Results are returned in the order of `uris_list` (not in completion order), so
    repeated runs over the same inputs produce the same output order. A URI whose
    load raises is reported on stdout and left out of the result.

    :param uris_list: list of S3 URIs (or local) to load
    :param num_workers: int, number of concurrent workers
    :return: list of loaded objects, ordered like `uris_list`

    >>> selected_uris = [f"{base_s3_uri}/{i}.merged.parquet" for i in selected_ids]
    >>> dfs = concurrent_loads(selected_uris, num_workers)
    """
    uris = list(uris_list)
    loaded_by_index = {}  # position in `uris` -> loaded object (successful loads only)
    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        future_to_index = {
            executor.submit(ub.loads, curr_uri, debug_print=False): idx
            for idx, curr_uri in enumerate(uris)
        }

        for future in tqdm(as_completed(future_to_index), total=len(uris), desc="Loading batches"):
            idx = future_to_index[future]
            try:
                loaded_by_index[idx] = future.result()
            except Exception as e:
                # Best effort: report the failure and keep loading the remaining URIs.
                print(f"Exception for {uris[idx]}: {e}")
    return [loaded_by_index[idx] for idx in sorted(loaded_by_index)]

# Gather the input files under ./txts (presumably ub.traverses lists them — confirm)
# and load them in parallel; each successfully loaded file contributes one entry.
txt_files = ub.traverses("./txts")
txt_lists = concurrent_loads(txt_files)



def flatten(xss):
    """Concatenate the items of every inner iterable of ``xss`` into one flat list (one level deep)."""
    flat = []
    for inner in xss:
        flat.extend(inner)
    return flat

# Merge the per-file lists into one flat list and report its length.
prompt_list = flatten(txt_lists)
print(len(prompt_list))


# Write the combined list to a single text file.
ub.saves(prompt_list, "full_pixiv_prompts.txt")

## gradio

In [3]:
import gradio as gr
import transformers

# Device the demo model runs on.
DEVICE = 'cuda'  # or 'cpu' if you are not using CUDA

# Checkpoint directory; the training loop writes one './trained_model/epoch_<n>/' per epoch.
MODEL_DIR = './trained_model/epoch_6/'

# Load the fine-tuned model and the tokenizer saved next to it, then move the model to DEVICE.
model = transformers.GPT2LMHeadModel.from_pretrained(MODEL_DIR)
tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_DIR)
model.to(DEVICE)

def generate_text(prompt, max_length=20):
    """Generates text based on the input prompt.

    Args:
    - prompt: Starting text. An empty prompt returns an empty string, because
      there would be no input tokens to generate from.
    - max_length: Maximum number of tokens to generate on top of the prompt.
      Cast to int, since a UI slider may hand over a float.

    Returns:
    The decoded prompt plus continuation, cut at the first end-of-sequence
    token when one was produced.
    """
    if not prompt:
        return ""

    model.eval()  # Set the model to evaluation mode
    # Let the tokenizer build input_ids and the attention mask together.
    encoded = tokenizer(prompt, return_tensors='pt').to(DEVICE)
    prompt_len = encoded['input_ids'].shape[1]
    output_sequences = model.generate(
        input_ids=encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=int(max_length) + prompt_len,
        temperature=1.0,
        top_k=50,
        top_p=0.95,
        repetition_penalty=1.0,
        do_sample=True,
        num_return_sequences=1,
        # Explicit padding token: avoids the per-call "Setting `pad_token_id`..." notice.
        pad_token_id=tokenizer.eos_token_id,
    )

    generated_sequence = output_sequences[0].tolist()
    text = tokenizer.decode(generated_sequence, clean_up_tokenization_spaces=True)
    # Remove the end of sequence token and anything after it. split() leaves the text
    # untouched when the token is absent, whereas slicing with find() == -1 would drop
    # the last character.
    eos_token = tokenizer.eos_token
    if eos_token:
        text = text.split(eos_token, 1)[0]

    # Return the generated text
    return text

# Define Gradio interface: a prompt box and a length slider in, one text box out.
prompt_box = gr.Textbox(lines=2, placeholder="Enter your prompt here...")
length_slider = gr.Slider(minimum=10, maximum=300, value=50)
output_box = gr.Textbox(label="Generated Text")

iface = gr.Interface(
    fn=generate_text,
    inputs=[prompt_box, length_slider],
    outputs=output_box,
    title="GPT-2 Text Generation",
    description="This model generates text based on the input prompt. It's fine-tuned from GPT-2.",
)

# Launch the interface (share=True also requests a public share link)
iface.launch(share=True)

Running on local URL:  http://127.0.0.1:7861
Running on public URL: https://ac4f5619db477979e7.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Traceback (most recent call last):
  File "/home/ubuntu/miniconda3/lib/python3.10/site-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
  File "/home/ubuntu/miniconda3/lib/python3.10/site-packages/gradio/route_utils.py", line 235, in call_process_api
    output = await app.get_blocks().process_api(
  File "/home/ubuntu/miniconda3/lib/python3.10/site-packages/gradio/blocks.py", line 1627, in process_api
    result = await self.call_function(
  File "/home/ubuntu/miniconda3/lib/python3.10/site-packages/gradio/blocks.py", line 1173, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/home/ubuntu/miniconda3/lib/python3.10/site-packages/anyio/to_thread.py", line 33, in run_sync
    return await get