# Train a CodeLlama 7b model

## Grab dataset

In [1]:
import wandb
from datasets import load_from_disk

In [2]:
WANDB_PROJECT = "wandbot_llm"

TOKENIZED_DATASET_AT = 'capecape/wandbot_llm/wandbot_dataset_tokenized:v0'

In [3]:
run = wandb.init(project=WANDB_PROJECT, job_type="training")

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [16]:
def load_from_artifact(at_address, at_type="dataset"):
    "Load a HF dataset from a W&B artifact"
    artifact = wandb.use_artifact(at_address, type=at_type)
    artifact_dir = artifact.download()
    return load_from_disk(artifact_dir)

In [5]:
train_ds = load_from_artifact(TOKENIZED_DATASET_AT)

[34m[1mwandb[0m: Downloading large artifact wandbot_dataset_tokenized:v0, 62.36MB. 3 files... 
[34m[1mwandb[0m:   3 of 3 files downloaded.  
Done. 0:0:0.2


In [6]:
train_ds

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 4907
})

In [7]:
from types import SimpleNamespace

config = SimpleNamespace(
    model_id="codellama/CodeLlama-7b-Instruct-hf",
    dataset_name="alpaca-gpt4",
    precision="bf16",  # faster and better than fp16, requires new GPUs
    n_freeze=24,  # How many layers we don't train, LLama 7B has 32.
    lr=3e-4,
    n_eval_samples=10, # How many samples to generate on validation
    max_seq_len=1024, # Lenght of the sequences to pack
    epochs=1,  # we do one pass over the dataset, we could actually do more
    gradient_accumulation_steps=4,  # evey how many iterations we update the gradients, simulates larger batch sizes
    batch_size=4,  # what my GPU can handle, depends on how many layers are we training  
    # epoch_sz=len(train_dataloader),  # the theorical epoch size, here it's just the steps
    # eval_every=len(train_dataloader)//5,  # every now and then we want to sample from the model
    log_model=False,  # upload the model to W&B?
    mom=0.9, # optim param
    gradient_checkpointing = True,  # saves even more memory
    freeze_embed = True,  # why train this? let's keep them frozen ❄️
)


In [8]:
import torch
from transformers import AutoModelForCausalLM

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    config.model_id,
    use_cache=False
    if config.gradient_checkpointing
    else True,  # this is needed for gradient checkpointing
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

Downloading (…)lve/main/config.json:   0%|          | 0.00/646 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

Downloading (…)of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## Training

In [10]:
def save_model(model_name, log=False):
    "Save pytorch model to disk and wandb"
    model_name = f"{wandb.run.id}_{model_name}"
    model.save_pretrained(model_name, safe_serialization=True)
    if log:
        at = wandb.Artifact(model_name, type="model")
        at.add_dir(model_name)
        wandb.log_artifact(at)

In [11]:
# save_model("codellama")

## Train

In [12]:
from transformers import Trainer, TrainingArguments, default_data_collator

output_dir = "/tmp/transformers"
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=config.batch_size,
    bf16=True,
    learning_rate=config.lr,
    num_train_epochs=1,
    gradient_checkpointing=config.gradient_checkpointing,
    # logging strategies
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="no",
)

In [13]:
def param_count(m):
    params = sum([p.numel() for p in m.parameters()])/1_000_000
    trainable_params = sum([p.numel() for p in m.parameters() if p.requires_grad])/1_000_000
    print(f"Total params: {params:.2f}M, Trainable: {trainable_params:.2f}M")
    return params, trainable_params

params, trainable_params = param_count(model)

Total params: 6738.55M, Trainable: 6738.55M


In [14]:
n_freeze = 24

# freeze layers (disable gradients)
for param in model.parameters(): param.requires_grad = False
for param in model.lm_head.parameters(): param.requires_grad = True
for param in model.model.layers[n_freeze:].parameters(): param.requires_grad = True

In [15]:
params, trainable_params = param_count(model)

Total params: 6738.55M, Trainable: 1750.20M


In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    data_collator=default_data_collator,
)

In [17]:
trainer.train()

Step,Training Loss
10,1.448
20,1.2322
30,1.2232
40,1.0981
50,1.105
60,1.0681
70,1.0941
80,1.0483
90,1.061
100,1.0846


TrainOutput(global_step=1227, training_loss=0.8360894948559849, metrics={'train_runtime': 1640.3197, 'train_samples_per_second': 2.991, 'train_steps_per_second': 0.748, 'total_flos': 1.992041884192604e+17, 'train_loss': 0.8360894948559849, 'epoch': 1.0})

In [19]:
save_model("codellama7")

## Eval using transformers

In [1]:
import torch, wandb
from datasets import load_from_disk
from transformers import AutoModelForCausalLM

In [2]:
def load_from_artifact(at_address, at_type="dataset"):
    "Load a HF dataset from a W&B artifact"
    artifact = wandb.use_artifact(at_address, type=at_type)
    artifact_dir = artifact.download()
    return load_from_disk(artifact_dir)

In [3]:
model_folder = "64yqh828_codellama7"
EVAL_DATASET_AT = 'capecape/wandbot_llm/wandbot_eval_dataset:v0'

In [4]:
from types import SimpleNamespace

config = SimpleNamespace(
    model_id="codellama/CodeLlama-7b-Instruct-hf",
    dataset_name="alpaca-gpt4",
    precision="bf16",  # faster and better than fp16, requires new GPUs
    n_freeze=24,  # How many layers we don't train, LLama 7B has 32.
    lr=3e-4,
    n_eval_samples=10, # How many samples to generate on validation
    max_seq_len=1024, # Lenght of the sequences to pack
    epochs=1,  # we do one pass over the dataset, we could actually do more
    gradient_accumulation_steps=4,  # evey how many iterations we update the gradients, simulates larger batch sizes
    batch_size=4,  # what my GPU can handle, depends on how many layers are we training  
    # epoch_sz=len(train_dataloader),  # the theorical epoch size, here it's just the steps
    # eval_every=len(train_dataloader)//5,  # every now and then we want to sample from the model
    log_model=False,  # upload the model to W&B?
    mom=0.9, # optim param
    gradient_checkpointing = True,  # saves even more memory
    freeze_embed = True,  # why train this? let's keep them frozen ❄️
)


In [5]:
model = AutoModelForCausalLM.from_pretrained(
    model_folder,
    device_map="auto",
    torch_dtype=torch.float16,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
model.eval();

In [7]:
from transformers import GenerationConfig, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(config.model_id)
tokenizer.pad_token = tokenizer.eos_token

gen_config = GenerationConfig.from_pretrained(config.model_id)

def generate(prompt, max_new_tokens=256, gen_config=gen_config):
    tokenized_prompt = tokenizer(prompt, return_tensors='pt')['input_ids'].cuda()
    with torch.inference_mode():
        output = model.generate(tokenized_prompt, 
                                max_new_tokens=max_new_tokens, 
                                generation_config=gen_config)
    return tokenizer.decode(output[0][len(tokenized_prompt[0]):], skip_special_tokens=True)

In [8]:
WANDB_PROJECT = "wandbot_llm"

In [9]:
run = wandb.init(project=WANDB_PROJECT, job_type="training")

[34m[1mwandb[0m: Currently logged in as: [33mcapecape[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [10]:
eval_ds = load_from_artifact(EVAL_DATASET_AT)
eval_ds = eval_ds["train"]

[34m[1mwandb[0m:   4 of 4 files downloaded.  


In [11]:
max_new_tokens = 512
generations = []

In [12]:
from fastprogress import progress_bar

for i, sample in progress_bar(enumerate(eval_ds), total=len(eval_ds)):
    output = generate(sample["text"], max_new_tokens=max_new_tokens)
    generations.append([sample, output])
    # except:
    #     print(f"Error in the generation {i}")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


OutOfMemoryError: CUDA out of memory. Tried to allocate 2.96 GiB (GPU 0; 22.19 GiB total capacity; 16.00 GiB already allocated; 622.50 MiB free; 20.58 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [20]:
list(generations[0][0].keys())

['question', 'answer', 'page_content', 'char_len', 'text']

In [31]:
len(generations)

81

In [27]:
originals_cols = list(generations[0][0].keys())
table = wandb.Table(columns=originals_cols+["codellama7", "max_tokens"])
for gen in generations:
    d, code_answer = gen
    table.add_data(*d.values(), code_answer, max_new_tokens)

In [28]:
wandb.log({"generations": table})

In [30]:
wandb.finish()