# Training a Model from Scratch

In [10]:
from argparse import Namespace

# Commented parameters correspond to the small model
config = {"train_batch_size": 2, # 12
          "valid_batch_size": 2, # 12
          "weight_decay": 0.1,
          "shuffle_buffer": 1000,
          "learning_rate": 2e-4, # 5e-4
          "lr_scheduler_type": "cosine",
          "num_warmup_steps": 5, # 2000
          "gradient_accumulation_steps": 16, # 1
          "max_train_steps": 10, # 150000
          "max_eval_steps": -1,
          "seq_length": 1024,
          "seed": 1,
          "save_checkpoint_steps": 10} # 15000

args = Namespace(**config)

In [11]:
from torch.utils.tensorboard import SummaryWriter
import logging
import wandb
import datasets
import transformers

def setup_logging(project_name):
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[
        logging.FileHandler(f"MyOwnLLM/log/debug_{accelerator.process_index}.log"),
        logging.StreamHandler()])
    if accelerator.is_main_process: # We only want to set up logging once
        wandb.init(project=project_name, config=args)
        run_name = wandb.run.name
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {'0': 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ''
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name

In [12]:
def log_metrics(step, metrics):
    logger.info(f"Step {step}: {metrics}")
    if accelerator.is_main_process:
        wandb.log(metrics)
        [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]

In [13]:
from torch.utils.data import IterableDataset
import torch
class ConstantLengthDataset(IterableDataset):
    
    def __init__(self, tokenizer, dataset, seq_length=1024,
                 num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.bos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences
    
    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    break
                try:
                    buffer.append(next(iterator)['content'])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    more_examples = False
                    break
            tokenized_inputs = tokenizer(buffer, truncation=False)['input_ids']
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [14]:
from torch.utils.data.dataloader import DataLoader
def create_dataloaders(dataset_name):
    train_data = load_dataset(dataset_name+'-train', split="train",
                              streaming=True)
    train_data = train_data.shuffle(buffer_size=args.shuffle_buffer,
                                    seed=args.seed)
    valid_data = load_dataset(dataset_name+'-valid', split="validation",
                              streaming=True)
    train_dataset = ConstantLengthDataset(tokenizer, train_data,
                                          seq_length=args.seq_length)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data,
                                          seq_length=args.seq_length)
    train_dataloader=DataLoader(train_dataset, batch_size=args.train_batch_size)
    eval_dataloader=DataLoader(valid_dataset, batch_size=args.valid_batch_size)
    return train_dataloader, eval_dataloader

In [15]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [{'params': params_with_wd, 'weight_decay': args.weight_decay},
            {'params': params_without_wd, 'weight_decay': 0.0}]

In [16]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dataloader):
        with torch.no_grad():
            outputs = model(batch, labels=batch)
        loss = outputs.loss.repeat(args.valid_batch_size)
        losses.append(accelerator.gather(loss))
        if args.max_eval_steps > 0 and step >= args.max_eval_steps:
            break
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except OverflowError:
        perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()


In [17]:
import argparse
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer, AdamW, get_scheduler
from datasets import load_dataset
from huggingface_hub import Repository
from transformers import pipeline, set_seed

# Include any other libraries referenced in your custom functions (e.g., setup_logging, create_dataloaders, log_metrics, evaluate, get_grouped_params)
import logging
import math

set_seed(args.seed)
project_name = 'transformersbook/codeparrot'
dataset_name = 'transformersbook/codeparrot'
# Accelerator
accelerator = Accelerator()
samples_per_step = accelerator.state.num_processes * args.train_batch_size

# Logging
logger, tb_writer, run_name = setup_logging(project_name.split("/")[1])


# Load model and tokenizer
if accelerator.is_main_process:
    hf_repo = Repository("MyOwnLLM/", clone_from=project_name, revision=run_name)
model = AutoModelForCausalLM.from_pretrained("MyOwnLLM/", gradient_checkpointing=True)
tokenizer = AutoTokenizer.from_pretrained("MyOwnLLM/")

# Load dataset and dataloader
train_dataloader, eval_dataloader = create_dataloaders(dataset_name)

# Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                             num_warmup_steps=args.num_warmup_steps,
                             num_training_steps=args.max_train_steps,)
def get_lr():
    return optimizer.param_groups[0]['lr']

# Prepare everything with our `accelerator` (order of args is not important)
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
    model, optimizer, train_dataloader, eval_dataloader)

# Train model
model.train()
completed_steps = 0
for step, batch in enumerate(train_dataloader, start=1):
    loss = model(batch, labels=batch).loss
    log_metrics(step, {'lr': get_lr(), 'samples': step*samples_per_step,
                       'steps': completed_steps, 'loss/train': loss.item()})
    loss = loss / args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % args.save_checkpoint_steps == 0:
        logger.info('Evaluating and saving model checkpoint')
        eval_loss, perplexity = evaluate()
        log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            unwrapped_model.save_pretrained("./")
            hf_repo.push_to_hub(commit_message=f'step {step}')
        model.train()
    if completed_steps >= args.max_train_steps:
        break

# Evaluate and save the last checkpoint
logger.info('Evaluating and saving model after training')
eval_loss, perplexity = evaluate()
log_metrics(step, {'loss/eval': eval_loss, 'perplexity': perplexity})
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    unwrapped_model.save_pretrained("./")
    hf_repo.push_to_hub(commit_message=f'final model')

VBox(children=(Label(value='0.038 MB of 0.038 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
loss/train,▁▁▃▄▆▁▆▇▇▆▁▂▁▁▁▁▃▂▃▂▂▅▃▂▄▃▁▅▅█▁▁▁▃▄▅▄▂▁▁
lr,▁▁▁▂▂▂▂▂▃▃▃▃▃▃▅▅▅▅▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇▇██
samples,▁▁▂▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
steps,▁▁▁▁▁▁▂▂▂▂▂▂▂▃▃▃▃▃▅▅▅▅▅▅▅▆▆▆▆▆▆▆▇▇▇▇▇▇▇█

0,1
loss/train,0.14362
lr,2e-05
samples,200.0
steps,6.0


For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
c:\Users\hedin\Documents\MyOwnLLM\MyOwnLLM/ is already a clone of https://huggingface.co/transformersbook/codeparrot. Make sure you pull the latest changes with `repo.git_pull()`.
Revision `comic-deluge-19` does not exist. Created and checked out branch `comic-deluge-19`.

loading configuration file MyOwnLLM/config.json
Model config GPT2Config {
  "_name_or_path": "MyOwnLLM/",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "gradient_checkpointing": true,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": true,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": true,
  "s

Resolving data files:   0%|          | 0/183 [00:00<?, ?it/s]

Using custom data configuration default-5e86e85b62bb20a2
11/23/2024 08:07:24 - INFO - datasets.builder - Using custom data configuration default-5e86e85b62bb20a2
Loading Dataset Infos from c:\Users\hedin\anaconda3\envs\llm_lab\lib\site-packages\datasets\packaged_modules\json
11/23/2024 08:07:24 - INFO - datasets.info - Loading Dataset Infos from c:\Users\hedin\anaconda3\envs\llm_lab\lib\site-packages\datasets\packaged_modules\json
Attempting to acquire lock 1774107271664 on \\?\C:\Users\hedin\.cache\huggingface\datasets\_Users_hedin_.cache_huggingface_datasets_transformersbook___codeparrot-train_default-5e86e85b62bb20a2_0.0.0_8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96.lock
11/23/2024 08:07:24 - DEBUG - datasets.utils.filelock - Attempting to acquire lock 1774107271664 on \\?\C:\Users\hedin\.cache\huggingface\datasets\_Users_hedin_.cache_huggingface_datasets_transformersbook___codeparrot-train_default-5e86e85b62bb20a2_0.0.0_8bb11242116d547c741b2e8a1f18598ffdd40a1d4

KeyboardInterrupt: 

In [19]:

#hide_output
from transformers import pipeline, set_seed

model_ckpt = 'MyOwnLLM'
generation = pipeline('text-generation', model=model_ckpt, device=0)

loading configuration file MyOwnLLM\config.json
Model config GPT2Config {
  "_name_or_path": "MyOwnLLM",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 0,
  "gradient_checkpointing": false,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": true,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": true,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.16.2",
  "use_cache"

All model checkpoint weights were used when initializing GPT2LMHeadModel.

All the weights of GPT2LMHeadModel were initialized from the model checkpoint at MyOwnLLM.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GPT2LMHeadModel for predictions without further training.
Didn't find file MyOwnLLM\added_tokens.json. We won't load it.
loading file MyOwnLLM\vocab.json
loading file MyOwnLLM\merges.txt
loading file MyOwnLLM\tokenizer.json
loading file None
loading file MyOwnLLM\special_tokens_map.json
loading file MyOwnLLM\tokenizer_config.json


AssertionError: Torch not compiled with CUDA enabled

In [22]:
torch.cuda.is_available()

False