# Train a CodeLlama 7B model

## Grab dataset

In [1]:
import wandb
from utils import load_ds_from_artifact, WandbPredictionProgressCallback

In [2]:
# W&B project that the runs in this notebook log to.
WANDB_PROJECT = "wandbot_llm"

# W&B artifact addresses of the tokenized training data and the evaluation prompts.
TOKENIZED_DATASET_AT = 'capecape/wandbot_llm/wandbot_dataset_tokenized:v0'
TEST_DATASET_AT = 'capecape/wandbot_llm/wandbot_eval_dataset:v0'

In [3]:
# Start the W&B run that the rest of the training section logs to.
run = wandb.init(project=WANDB_PROJECT, job_type="training")

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
# Fetch the pre-tokenized dataset from its W&B artifact (helper imported from utils).
tok_ds = load_ds_from_artifact(TOKENIZED_DATASET_AT)

[34m[1mwandb[0m: Downloading large artifact wandbot_dataset_tokenized:v0, 62.36MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.3


In [5]:
from types import SimpleNamespace

# Hyperparameters and switches for this fine-tuning run, gathered in one place.
config = SimpleNamespace(
    model_id="codellama/CodeLlama-7b-Instruct-hf",
    layers_to_train=8,  # number of decoder layers meant to be trained (Llama 7B has 32)
    n_eval_samples=10,  # number of prompts to generate from at each evaluation
    batch_size=1,  # what the GPU can handle; depends on how many layers are trained
    log_model=False,  # whether to upload the model to W&B
    freeze_embed=True,  # keep the token embeddings frozen
    gradient_checkpointing=True,  # trade extra compute for lower memory use
    seed=42,
)


In [6]:
# Hold out 1% of the tokenized data for validation; the seed makes the split repeatable.
splitted_ds = tok_ds.train_test_split(train_size=0.99, seed=config.seed)
train_dataset, eval_dataset = splitted_ds["train"], splitted_ds["test"]

In [7]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

In [8]:
# Load the base model in bfloat16. The generation cache has to be turned off
# when gradient checkpointing is on, so `use_cache` is simply its negation.
model = AutoModelForCausalLM.from_pretrained(
    config.model_id,
    use_cache=not config.gradient_checkpointing,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Load the tokenizer that matches the base model and reuse EOS as the padding token.
tokenizer = AutoTokenizer.from_pretrained(config.model_id)
tokenizer.pad_token = tokenizer.eos_token

## Training

In [10]:
from transformers import Trainer, TrainingArguments, default_data_collator

# Checkpoints are disabled below (save_strategy="no"), so this directory only holds logs.
output_dir = "/tmp/transformers"
training_args = TrainingArguments(
    output_dir=output_dir,
    # Read batch size, checkpointing and seed from `config` so that editing the
    # config cell actually changes the run (they used to be repeated literals).
    per_device_train_batch_size=config.batch_size,
    bf16=True,
    learning_rate=2e-4,
    num_train_epochs=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=config.gradient_checkpointing,
    seed=config.seed,
    evaluation_strategy="steps",
    # eval_accumulation_steps=4,  # enable to move accumulated eval outputs off the GPU periodically
    eval_steps=50,
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
)

In [11]:
def param_count(m):
    """Print and return the total and trainable parameter counts of `m`, in millions.

    Args:
        m: object whose ``parameters()`` yields items exposing ``numel()`` and
            ``requires_grad``.

    Returns:
        Tuple ``(total, trainable)`` of floats, both expressed in millions.
    """
    total = 0
    trainable = 0
    for p in m.parameters():
        n = p.numel()
        total += n
        if p.requires_grad:
            trainable += n
    params = total / 1_000_000
    trainable_params = trainable / 1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

# Baseline count before any freezing is applied.
params, trainable_params = param_count(model)

Total params: 6738.55M, Trainable: 6738.55M


In [12]:
# NOTE: despite its name, `n_freeze` is the number of trailing decoder layers
# that remain trainable, not the number that get frozen.
# NOTE(review): `config.layers_to_train` (8) is not used here — confirm which value is intended.
n_freeze = 6

# Switch gradients off for the whole model, then back on for the LM head and
# the last `n_freeze` decoder layers.
model.requires_grad_(False)
model.lm_head.requires_grad_(True)
model.model.layers[-n_freeze:].requires_grad_(True)

In [13]:
# Keep the token embeddings frozen for a small memory saving.
# The previous cell already disables gradients on every parameter outside the
# LM head and the last layers, so this is an explicit safeguard.
if config.freeze_embed:
    model.model.embed_tokens.weight.requires_grad_(False);

In [14]:
# Count again after freezing to see how much of the model is still trainable.
params, trainable_params = param_count(model)

Total params: 6738.55M, Trainable: 1345.44M


In [15]:
import evaluate
def compute_metrics(eval_preds):
    """Compute next-token accuracy for a causal language model.

    Args:
        eval_preds: pair ``(logits, labels)``. ``logits`` is either an array of
            per-token vocabulary scores (one more dimension than ``labels``),
            a tuple whose first item is that array, or already-selected token
            ids with the same number of dimensions as ``labels``.

    Returns:
        ``{"accuracy": float}`` — the fraction of scored positions whose
        predicted token equals the next token in ``labels``. Returns 0.0 when
        no position is scored.
    """
    # Local import: the notebook never imports numpy at the top level, which
    # made the original `np.argmax` call fail with a NameError.
    import numpy as np

    logits, labels = eval_preds
    if isinstance(logits, tuple):
        logits = logits[0]
    logits = np.asarray(logits)
    labels = np.asarray(labels)

    # Accept both raw scores and ids that were already reduced upstream.
    predictions = logits if logits.ndim == labels.ndim else np.argmax(logits, axis=-1)

    # The output at position t is the model's guess for the token at t + 1,
    # so drop the last prediction and the first label to line them up.
    predictions = predictions[..., :-1]
    targets = labels[..., 1:]

    # -100 is the conventional "ignore" label for padded/masked positions;
    # if the dataset never uses it, this mask keeps every position.
    scored = targets != -100
    if not scored.any():
        return {"accuracy": 0.0}
    return {"accuracy": float((predictions[scored] == targets[scored]).mean())}


In [17]:
# Assemble the Trainer from the model, arguments, data splits and metric function defined above.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # NOTE(review): default_data_collator does not pad; assumes the tokenized
    # samples already share one length — confirm against the dataset.
    data_collator=default_data_collator,
    compute_metrics=compute_metrics,
)

In [18]:
# Load the evaluation prompts that the sampling callback generates from.
test_ds = load_ds_from_artifact(TEST_DATASET_AT)

[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [19]:
# Inspect the splits, columns and row count of the evaluation dataset.
test_ds

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'page_content', 'char_len', 'text'],
        num_rows: 132
    })
})

In [20]:
from functools import partial
from transformers import GenerationConfig
from transformers.integrations import WandbCallback
from fastprogress import progress_bar


def _generate(prompt, model, tokenizer, gen_config):
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
    with torch.inference_mode():
        output = model.generate(tokenized_prompt, 
                                generation_config=gen_config)
    return tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)


class LLMSampleCB(WandbCallback):
    """W&B callback that logs a table of sample generations after each evaluation."""

    def __init__(self, trainer, tokenizer, test_dataset, num_samples=10, max_new_tokens=256,
                 max_prompt_chars=1000):
        """Prepare the prompts and the generation function.

        Args:
            trainer: Trainer whose model is sampled from.
            tokenizer: tokenizer used to encode prompts and decode generations.
            test_dataset: dataset dict; prompts come from the "text" column of its "train" split.
            num_samples: number of leading rows to generate from (capped at the split size).
            max_new_tokens: generation length limit stored in the generation config.
            max_prompt_chars: only the last this-many characters of each prompt are fed
                to the model; a falsy value feeds the whole prompt.
        """
        super().__init__()
        split = test_dataset["train"]
        # Never request more rows than exist, which would make `select` fail.
        self.sample_dataset = split.select(range(min(num_samples, len(split))))
        self.max_prompt_chars = max_prompt_chars
        self.gen_config = GenerationConfig.from_pretrained(trainer.model.name_or_path,
                                                           max_new_tokens=max_new_tokens)
        self.generate = partial(_generate,
                                model=trainer.model,
                                tokenizer=tokenizer,
                                gen_config=self.gen_config)

    def log_generations_table(self, examples):
        """Generate from each example and log prompt, generation and generation settings to W&B."""
        # The generation settings are the same for every row, so serialize them once.
        gen_params = self.gen_config.to_dict()
        gen_values = list(gen_params.values())
        records_table = wandb.Table(columns=["prompt", "generation"] + list(gen_params.keys()))
        for example in progress_bar(examples, leave=False):
            prompt = example["text"]
            # Character-level tail truncation keeps very long prompts manageable;
            # the table still records the full prompt.
            model_input = prompt[-self.max_prompt_chars:] if self.max_prompt_chars else prompt
            generation = self.generate(prompt=model_input)
            records_table.add_data(prompt, generation, *gen_values)
        self._wandb.log({"sample_predictions":records_table})

    def on_evaluate(self, args, state, control,  **kwargs):
        """Log a fresh table of sample generations whenever the Trainer evaluates."""
        self.log_generations_table(self.sample_dataset)

In [21]:
# Take the sample count from the run config instead of repeating the literal,
# so changing `n_eval_samples` in the config cell takes effect here.
num_samples = config.n_eval_samples

wandb_callback = LLMSampleCB(trainer, tokenizer, test_ds, num_samples=num_samples, max_new_tokens=256)

In [22]:
# Register the sampling callback so generations are logged at every evaluation.
trainer.add_callback(wandb_callback)

In [23]:
# Run fine-tuning; evaluation and the sampling callback fire every `eval_steps` steps.
trainer.train()

Step,Training Loss,Validation Loss


OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB (GPU 0; 22.19 GiB total capacity; 18.13 GiB already allocated; 810.50 MiB free; 20.29 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [27]:
from utils import save_model

In [28]:
# Persist the fine-tuned model via the project helper.
# NOTE(review): the eval section loads "64yqh828_codellama7", so save_model
# presumably prefixes the name with the run id — confirm in utils.
save_model(model, "codellama7")

Using pad_token, but it is not set yet.
Using pad_token, but it is not set yet.


In [29]:
# Close the training run so its metrics and summary are finalized.
wandb.finish()

0,1
eval/loss,██▆▆▅▅▅▄▄▄▃▃▃▃▃▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
eval/runtime,▁▁▇▇▇██▇▇█▇▇▇▇▇██▇▇█▇▇▇▇███▇▇██████████▇
eval/samples_per_second,██▂▂▂▁▁▂▂▁▂▂▂▂▂▁▁▁▁▁▂▂▁▁▁▁▁▂▂▁▁▁▁▁▁▁▁▁▁▁
eval/steps_per_second,██▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
train/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▂▁▁▁
train/loss,█▇▅▆▄▅▃▃▃▄▃▄▂▃▄▂▃▃▃▂▂▄▂▂▁▂▃▁▂▂▃▃▃▂▃▂▂▂▂▁
train/total_flos,▁▁
train/train_loss,▁▁

0,1
eval/loss,0.89012
eval/runtime,15.9024
eval/samples_per_second,3.144
eval/steps_per_second,0.44
train/epoch,1.0
train/global_step,1214.0
train/learning_rate,0.0
train/loss,0.7856
train/total_flos,1.9713379640593613e+17
train/train_loss,0.91711


## Eval using transformers

In [None]:
import torch, wandb
from datasets import load_from_disk
from transformers import AutoModelForCausalLM

In [None]:
def load_from_artifact(at_address, at_type="dataset"):
    "Download a W&B artifact and open the downloaded folder as a HF dataset"
    local_dir = wandb.use_artifact(at_address, type=at_type).download()
    return load_from_disk(local_dir)

In [None]:
# Local folder holding the fine-tuned model, and the W&B artifact with the evaluation prompts.
model_folder = "64yqh828_codellama7"
EVAL_DATASET_AT = 'capecape/wandbot_llm/wandbot_eval_dataset:v0'

In [None]:
from types import SimpleNamespace

# Settings for the evaluation section; this replaces the training `config` defined earlier.
config = SimpleNamespace(
    model_id="codellama/CodeLlama-7b-Instruct-hf",
    dataset_name="alpaca-gpt4",
    precision="bf16",  # faster and better than fp16, requires recent GPUs
    n_freeze=24,  # how many layers we don't train (Llama 7B has 32)
    lr=3e-4,
    n_eval_samples=10,  # how many samples to generate on validation
    max_seq_len=1024,  # length of the sequences to pack
    epochs=1,  # one pass over the dataset; more are possible
    gradient_accumulation_steps=4,  # every how many iterations the weights are updated; simulates larger batches
    batch_size=4,  # what the GPU can handle; depends on how many layers are trained
    # epoch_sz=len(train_dataloader),  # the theoretical epoch size, here it's just the steps
    # eval_every=len(train_dataloader)//5,  # every now and then we want to sample from the model
    log_model=False,  # whether to upload the model to W&B
    mom=0.9,  # optimizer parameter
    gradient_checkpointing=True,  # saves even more memory
    freeze_embed=True,  # keep the token embeddings frozen
)


In [None]:
# Load the fine-tuned model in the precision named by the config. Training
# used bfloat16, and loading those weights as float16 narrows their numeric
# range, so float16 is only used when the config asks for something else.
model = AutoModelForCausalLM.from_pretrained(
    model_folder,
    device_map="auto",
    torch_dtype=torch.bfloat16 if config.precision == "bf16" else torch.float16,
)

In [None]:
# Switch to inference mode (e.g. disables dropout); the `;` hides the printed module repr.
model.eval();

In [None]:
from transformers import GenerationConfig, AutoTokenizer

# Tokenizer and default generation settings both come from the base model id,
# not from the fine-tuned folder.
tokenizer = AutoTokenizer.from_pretrained(config.model_id)
tokenizer.pad_token = tokenizer.eos_token

gen_config = GenerationConfig.from_pretrained(config.model_id)

def generate(prompt, max_new_tokens=256, gen_config=gen_config):
    """Generate a completion for `prompt` with the notebook-level model and tokenizer.

    Args:
        prompt: text to condition on.
        max_new_tokens: upper bound on the number of generated tokens.
        gen_config: generation configuration; defaults to the one loaded above.

    Returns:
        The decoded continuation, with the prompt tokens and special tokens removed.
    """
    # Send the inputs to wherever the model lives instead of assuming a CUDA device.
    input_ids = tokenizer(prompt, return_tensors='pt')['input_ids'].to(model.device)
    with torch.inference_mode():
        output = model.generate(input_ids,
                                max_new_tokens=max_new_tokens,
                                generation_config=gen_config)
    # The output sequence starts with the prompt; keep only what follows it.
    prompt_len = input_ids.shape[-1]
    return tokenizer.decode(output[0][prompt_len:], skip_special_tokens=True)

In [None]:
# Same W&B project as the training section; repeated so this section can run on its own.
WANDB_PROJECT = "wandbot_llm"

In [None]:
# This section only generates and logs predictions, so label the run as an
# evaluation job instead of reusing the "training" job type.
run = wandb.init(project=WANDB_PROJECT, job_type="evaluation")

In [None]:
# The artifact stores a dataset dict whose only split is "train"; keep just that split.
eval_ds = load_from_artifact(EVAL_DATASET_AT)["train"]

In [None]:
# Generation length limit for this evaluation, and the list that collects [sample, output] pairs.
# Re-run this cell before re-running the loop, otherwise results accumulate.
max_new_tokens = 512
generations = []

In [None]:
from fastprogress import progress_bar

# Generate an answer for every evaluation sample and keep it next to its source row.
for sample in progress_bar(eval_ds, total=len(eval_ds)):
    output = generate(sample["text"], max_new_tokens=max_new_tokens)
    generations.append([sample, output])

In [None]:
# Column names of the original samples; they become the leading table columns below.
list(generations[0][0].keys())

In [None]:
# Sanity check: one generation per evaluation sample is expected.
len(generations)

In [None]:
# One row per sample: the original columns, then the model's answer and the token limit used.
originals_cols = list(generations[0][0].keys())
table = wandb.Table(columns=originals_cols + ["codellama7", "max_tokens"])
for record, code_answer in generations:
    table.add_data(*record.values(), code_answer, max_new_tokens)

In [None]:
# Upload the table of generations to the active W&B run.
wandb.log({"generations": table})

In [None]:
# Close the evaluation run.
wandb.finish()