In [1]:
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer

model_ckpt = "susnato/codeparrot"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
config = AutoConfig.from_pretrained("gpt2-xl", vocab_size=len(tokenizer))
model = AutoModelForCausalLM.from_config(config)

2023-01-03 19:46:17.990179: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-03 19:46:18.115804: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-01-03 19:46:18.115823: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-01-03 19:46:18.139726: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-03 19:46:18.602820: W tensorflow/stream_executor/platform/de

Downloading:   0%|          | 0.00/689 [00:00<?, ?B/s]

In [3]:
def model_size(model):
    return sum(t.numel() for t in model.parameters())

print(f"GPT-2 (xl) size : {model_size(model)/1000**2:.1f} M parameters")
model.save_pretrained("models/"+model_ckpt, push_to_hub=True)

GPT-2 (xl) size : 1529.6 M parameters


In [8]:
config_small = AutoConfig.from_pretrained("gpt2", vocab_size=len(tokenizer))
model_small = AutoModelForCausalLM.from_config(config_small)

print(f"GPT-2 size : {model_size(model_small)/1000**2:.1f} M parameters")
model_small.save_pretrained("models/"+model_ckpt+"-small", push_to_hub=True)

GPT-2 size : 111.0 M parameters


In [6]:
import tqdm
from datasets import load_dataset

examples, total_characters, total_tokens = 500, 0, 0
dataset = load_dataset("transformersbook/codeparrot-train", split="train", streaming=True)

for _, example in tqdm.tqdm(zip(range(examples), iter(dataset)), total=examples):
    total_characters += len(example["content"])
    total_tokens += len(tokenizer(example["content"]).tokens())

characters_per_token = total_characters / total_tokens

print(characters_per_token)

Using custom data configuration transformersbook--codeparrot-train-ba60c789679753de
  0%|          | 1/500 [00:02<16:53,  2.03s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (2605 > 1024). Running this sequence through the model will result in indexing errors
100%|██████████| 500/500 [00:05<00:00, 86.97it/s] 

3.6233025034779565





In [9]:
import torch
from torch.utils.data import IterableDataset

class ConstantLengthDataset(IterableDataset):
    def __init__(self, tokenizer, dataset, seq_length=1024,
                 num_of_sequences=1024, chars_per_token=3.6):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.input_characters = seq_length * chars_per_token * num_of_sequences

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.input_characters:
                    m = f"Buffer Full : {buffer_len}>={self.input_characters:.0f}"
                    print(m)
                    break
                try:
                    m = f"Fill Buffer: {buffer_len}<{self.input_characters:.0f}"
                    print(m)
                    buffer.append(next(iterator)["content"])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    iterator = iter(self.dataset)

            all_token_ids = []
            tokenized_inputs = self.tokenizer(buffer, truncation=False)
            for tokenized_input in tokenized_inputs["input_ids"]:
                all_token_ids.extend(tokenized_input+[self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i+self.seq_length]
                if len(input_ids) == self.seq_length:
                    yield torch.tensor(input_ids)

In [10]:
shuffled_dataset = dataset.shuffle(buffer_size=100)
constant_length_dataset = ConstantLengthDataset(tokenizer, shuffled_dataset, num_of_sequences=10)
dataset_iterator = iter(constant_length_dataset)
lengths = [len(b) for _,b in zip(range(5), dataset_iterator)]
print(f"Lengths of the sequences : {lengths}")

Fill Buffer: 0<36864
Fill Buffer: 4570<36864
Fill Buffer: 6565<36864
Fill Buffer: 8721<36864
Fill Buffer: 27762<36864
Fill Buffer: 30826<36864
Fill Buffer: 34660<36864
Buffer Full : 43106>=36864
Lengths of the sequences : [1024, 1024, 1024, 1024, 1024]


In [15]:
from argparse import Namespace

# Commented parameters correspond to the small model
config = {"train_batch_size": 2, # 12
          "valid_batch_size": 2, # 12
          "weight_decay": 0.1,
          "shuffle_buffer": 1000,
          "learning_rate": 2e-4, # 5e-4
          "lr_scheduler_type": "cosine",
          "num_warmup_steps": 750, # 2000
          "gradient_accumulation_steps": 16, # 1
          "max_train_steps": 50000, # 150000
          "max_eval_steps": -1,
          "seq_length": 1024,
          "seed": 1,
          "save_checkpoint_steps": 50000} # 15000

args = Namespace(**config)

In [18]:
args

Namespace(train_batch_size=2, valid_batch_size=2, weight_decay=0.1, shuffle_buffer=1000, learning_rate=0.0002, lr_scheduler_type='cosine', num_warmup_steps=750, gradient_accumulation_steps=16, max_train_steps=50000, max_eval_steps=-1, seq_length=1024, seed=1, save_checkpoint_steps=50000)

In [54]:
import wandb
import logging
import datasets
import transformers
from accelerate import Accelerator
from torch.utils.tensorboard import SummaryWriter

accelerator = Accelerator()

def setup_logging(project_name):
    logger = logging.getLogger(__name__)
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO, handlers=[
                    logging.FileHandler(f"log/debug_{accelerator.process_index}.log"),
                    logging.StreamHandler()]
    )
    if accelerator.is_main_process: # We only want to set up logging once
        wandb.init(project=project_name, config=args)
        run_name = wandb.run.name
        tb_writer = SummaryWriter()
        tb_writer.add_hparams(vars(args), {'0': 0})
        logger.setLevel(logging.INFO)
        datasets.utils.logging.set_verbosity_debug()
        transformers.utils.logging.set_verbosity_info()
    else:
        tb_writer = None
        run_name = ''
        logger.setLevel(logging.ERROR)
        datasets.utils.logging.set_verbosity_error()
        transformers.utils.logging.set_verbosity_error()
    return logger, tb_writer, run_name

In [55]:
from torch.utils.data.dataloader import DataLoader

def create_dataloaders(dataset_name):
    train_data = load_dataset(dataset_name+'-train', split="train",
                              streaming=True)
    train_data = train_data.shuffle(buffer_size=args.shuffle_buffer,
                                    seed=args.seed)
    train_dataset = ConstantLengthDataset(tokenizer, train_data,
                                          seq_length=args.seq_length)
    train_dataloader=DataLoader(train_dataset, batch_size=args.train_batch_size)

    valid_data = load_dataset(dataset_name+'-valid', split="validation",
                              streaming=True)
    valid_dataset = ConstantLengthDataset(tokenizer, valid_data,
                                          seq_length=args.seq_length)
    eval_dataloader=DataLoader(valid_dataset, batch_size=args.valid_batch_size)
    return train_dataloader, eval_dataloader


In [56]:
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)

    return [{"params":params_with_wd, "weight_decay":args.weight_decay},
            {"params":params_without_wd, "weight_decay":0.0}]

In [60]:
from torch.optim import AdamW
from transformers import set_seed
from transformers import get_scheduler
from huggingface_hub import Repository

set_seed(args.seed)
project_name = "codeparrot-small"
model_to_be_trained = "codeparrot-small"

samples_per_step = accelerator.state.num_processes * args.train_batch_size

#Logging
logger, tb_writer, run_name = setup_logging(project_name)
logger.info(accelerator.state)

#Load model and tokenizer
if accelerator.is_main_process:
    hf_repo = Repository("./Training_files/",
                         clone_from="susnato/codeparrot-training-from-scratch")
model = AutoModelForCausalLM.from_pretrained(f"susnato/{model_to_be_trained}",
                                             gradient_checkpointing=True)
tokenizer = AutoTokenizer.from_pretrained("susnato/codeparrot")

#Load Dataset and DataLoader
train_dl, eval_dl = create_dataloaders("transformersbook/codeparrot")

#Prepare the optimizer and learning rate scheduler
optimizer = AdamW(get_grouped_params(model), lr=args.learning_rate)
lr_scheduler = get_scheduler(name=args.lr_scheduler_type, optimizer=optimizer,
                          num_warmup_steps=args.num_warmup_steps,
                          num_training_steps=args.max_train_steps)

def get_lr():
    return optimizer.param_groups[0]["lr"]

model, optimizer, train_dl, eval_dl = accelerator.prepare(model, optimizer, train_dl, eval_dl)


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01667050618337574, max=1.0)…

01/04/2023 00:59:37 - INFO - __main__ - Distributed environment: NO
Num processes: 1
Process index: 0
Local process index: 0
Device: cuda
Mixed precision type: no



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

OSError: Tried to clone a repository in a non-empty folder that isn't a git repository. If you really want to do this, do it manually:
git init && git remote add origin && git pull origin main
 or clone repo to a new folder and move your existing files there afterwards.

In [25]:
def evaluate():
    model.eval()
    losses = []
    for step, batch in enumerate(eval_dl):
        with torch.no_grad():
            opts = model(batch, labels=batch)
            loss = opts.loss.repeat(args.valid_batch_size)
            losses.append(accelerator.gather(loss))
            if args.max_eval_steps > 0 and step >= args.max_eval_steps:
                break
    loss = torch.mean(torch.cat(losses))
    try:
        perplexity = torch.exp(loss)
    except:
        OverflowError:
        perplexity = torch.tensor(float("inf"))
    return loss.item(), perplexity.item()

def log_metrics(step, metrics):
    logger.info(f"Step {step} : {metrics}")
    if accelerator.is_main_process:
        wandb.log(metrics)
        [tb_writer.add_scalar(k, v, step) for k, v in metrics.items()]

SyntaxError: invalid syntax (3342883810.py, line 15)

In [None]:
#Train Model
model.train()
completed_steps = 0
for step, batch in enumerate(train_dl, start=1):
    loss = model(batch, labels=batch).loss
    log_metrics(step, {"lr":get_lr(), "samples":step*samples_per_step,
                       "steps":completed_steps, "loss/train":loss.item()})
    loss = loss/args.gradient_accumulation_steps
    accelerator.backward(loss)
    if step % args.gradient_accumulation_steps == 0:
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        completed_steps += 1
    if step % args.save_checkpoint_steps == 0:
        logger.info('Evaluating and saving model checkpoint')
        eval_loss, perplexity = evaluate()
        log_metrics(step, {"loss/eval":eval_loss, "perplexity":perplexity})
        accelerator.wait_for_everyone()
        unwrapped_model = accelerator.unwrap_model(model)
        if accelerator.is_main_process:
            unwrapped_model.save_pretrained(f"Training_files/models/{model_to_be_trained}")
            hf_repo.push_to_hub(commit_message=f"Step - {step}")
        model.train()
    if completed_steps >= args.max_train_steps:
        break

logger.info("Evaluating and Saving model after training")
eval_loss, perplexity = evaluate()
log_metrics(step, {"loss/eval":eval_loss, "perplexity":perplexity})
accelerator.wait_for_everyone()
unwrapped_model = accelerator.unwrap_model(model)
if accelerator.is_main_process:
    unwrapped_model.save_pretrained(f"Training_files/models/{model_to_be_trained}")
    hf_repo.push_to_hub(commit_message=f"Final Model")