# Codestral fine-tuning with color dataset

2024-12-02 21:20

Attempt to replicate the color fine-tuning on the Codestral model. Unfortunately, `predict_with_generate` support is still a work in progress, so I used a merged version of the pull requests in both the `transformers` and `trl` repositories.

PRs merged:
https://github.com/huggingface/transformers/pull/32346
https://github.com/huggingface/trl/pull/2310
https://github.com/huggingface/trl/pull/2311

The output of the fine-tuned model is really strange and makes no sense. The original model simply refuses to do anything at all.
As a next step, I'd like to try running it with the original (unpatched) versions of the merged repos.

In [1]:
!pip install \
    "torch==2.3.0" \
    tensorboard

!pip install --upgrade \
    "transformers==4.41.2" \
    "accelerate==0.30.1" \
    "datasets==2.19.1" \
    "peft==0.11.1" \
    "bitsandbytes==0.43.1" \
    "trl==0.8.6" \
    "evaluate==0.4.2" \
    huggingface_hub huggingface

!pip install -U sentencepiece

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp311-cp311-manylinux1_x86_64.whl.metadata (26 kB)
Collecting tensorboard
  Downloading tensorboard-2.18.0-py3-none-any.whl.metadata (1.6 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu1

In [2]:
import torch
assert torch.cuda.get_device_capability()[0] >= 8, 'Hardware not supported for Flash Attention'

# on a multi-gpu machine
!FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE pip install flash-attn --no-build-isolation

# NOTE: use when 'Hardware not supported for Flash Attention'
# on a single gpu or only cpu machine 
! pip install ninja packaging
! MAX_JOBS=4 pip install flash-attn --no-build

Collecting flash-attn
  Downloading flash_attn-2.7.0.post2.tar.gz (2.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.7/2.7 MB[0m [31m62.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting einops (from flash-attn)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m22.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: flash-attn
  Building wheel for flash-attn (setup.py) ... [?25ldone
[?25h  Created wheel for flash-attn: filename=flash_attn-2.7.0.post2-py3-none-any.whl size=183275537 sha256=32ced721670a7a25c2b66872e3ac7341a9fa9c5c9e08f06916726ae2ff502b83
  Stored in directory: /root/.cache/pip/wheels/dd/35/a4/958e31aa700d7f22a16771a3964aec266cd1d01ce62d6e16b8
Successfully built flash-attn
Installing collected packages: einops, flas

In [3]:
!git config --global credential.helper store

In [4]:
import os

from huggingface_hub import login

# Never store an access token in the notebook: it ends up in version control
# and in every shared copy. Read it from the HF_TOKEN environment variable
# instead; when the variable is unset, `token=None` makes `login` fall back to
# its interactive prompt.
login(
    token=os.environ.get("HF_TOKEN"),
    add_to_git_credential=True,
)

In [5]:
!apt install zip -y
!rm -rf data-rb-color
!mkdir -p data-rb-color
!wget "https://www.dropbox.com/scl/fi/vd0ypt9mo9oh0p9tf90h3/dataset-rb-color-fixed.zip?rlkey=bieseudpp5pzko5j4u1n67phq&dl=1" -O model.zip
!unzip model.zip -d data-rb-color

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  unzip
The following NEW packages will be installed:
  unzip zip
0 upgraded, 2 newly installed, 0 to remove and 28 not upgraded.
Need to get 350 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 zip amd64 3.0-12build2 [176 kB]
Fetched 350 kB in 1s (340 kB/s)m[33m
debconf: delaying package configuration, since apt-utils is not installed

7[0;23r8[1ASelecting previously unselected package unzip.
(Reading database ... 15865 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m

In [6]:
from datasets import load_from_disk

# Fixed seed so the same held-out rows are selected on every run. The
# evaluation cells are executed after a kernel restart and index
# dataset['test'][0], so an unseeded split would silently evaluate on a
# different (possibly trained-on) sample.
SPLIT_SEED = 42
# Absolute number of held-out samples (an int is taken as a row count, which
# avoids the rounding of the fractional form 4/len(dataset)).
NUM_TEST_SAMPLES = 4

raw_dataset = load_from_disk('data-rb-color')
dataset = raw_dataset.train_test_split(test_size=NUM_TEST_SAMPLES, seed=SPLIT_SEED)

dataset

DatasetDict({
    train: Dataset({
        features: ['svg', 'html'],
        num_rows: 99996
    })
    test: Dataset({
        features: ['svg', 'html'],
        num_rows: 4
    })
})

In [7]:
# Instruction wrapper used for every prompt: the instruction text is placed
# between the [INST] and [/INST] tags.
mistral_instruct_template = "[INST]{instruction}[/INST]"

# Task description placed in front of every SVG input.
system_prompt = (
    "Your job is to turn an input SVG file to HTML and CSS code.\n"
    "You must generate only HTML and CSS code, no additional text."
)


def format_dataset(sample):
    """Add `prompt` and `completion` entries to one dataset row.

    Args:
        sample: mapping with an "svg" entry (model input) and an "html" entry
            (expected output).

    Returns:
        The same mapping, modified in place: "prompt" holds the system prompt
        and the SVG wrapped in the instruct template, "completion" holds the
        HTML unchanged.
    """
    # system prompt, blank line, SVG, blank line — then wrapped in [INST] tags
    instruction_parts = (system_prompt, sample["svg"], "")
    wrapped = mistral_instruct_template.format(
        instruction="\n\n".join(instruction_parts)
    )
    sample["prompt"] = wrapped
    sample["completion"] = sample["html"]
    return sample

# convert dataset to instruct prompt template
columns_to_remove = list(dataset["train"].column_names)
dataset = dataset.map(format_dataset, remove_columns=columns_to_remove, batched=False)

Map:   0%|          | 0/99996 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [8]:
print(dataset['train'][0])

{'prompt': '[INST]Your job is to turn an input SVG file to HTML and CSS code.\nYou must generate only HTML and CSS code, no additional text.\n\n<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="393" height="852" viewBox="0 0 393 852"><g id="html1"><g data-tag="head" id="head1" data-z-index="auto" data-stacking-context="true" aria-owns="script1"><g data-tag="script" id="script1" data-z-index="auto" data-stacking-context="true"/></g><g data-tag="body" id="body1" data-z-index="auto" data-stacking-context="true" role="document" aria-owns="style1"><g data-stacking-layer="rootBackgroundAndBorders"><rect width="377" height="836" x="8" y="8" fill="rgb(212, 31, 15)"/></g><g data-tag="style" id="style1" data-z-index="auto" data-stacking-context="true"/></g></g></svg>\n\n[/INST]', 'completion': '<body></body>\n\n<style>\n\n        body {\n            background-color: #d41f0f;\n        }\n    \n</style>'}


In [9]:
!mkdir -p sft_cache
!mkdir -p sft_cache/checkpoints
!mkdir -p sft_cache/model
!mkdir -p sft_cache/offload

In [10]:
# Central configuration cell: every tunable value used by the loading,
# LoRA and training cells below is defined here.

### model
# Hugging Face Hub id of the base checkpoint that gets fine-tuned
model_id = "mistral-community/Codestral-22B-v0.1"

### qlora related
# LoRA adapter hyper-parameters, passed to LoraConfig below
r = 64
lora_alpha = 16
lora_dropout = 0.1
task_type = "CAUSAL_LM"

### bitsandbytes related
# 4-bit quantization settings, passed to BitsAndBytesConfig below
load_in_4bit=True
bnb_4bit_use_double_quant=True
bnb_4bit_quant_type="nf4"
# name of a torch dtype; resolved later with getattr(torch, ...)
bnb_4bit_compute_dtype="bfloat16"

### training related
# working directories (created by the `mkdir -p sft_cache/...` cell above)
output_dir = "sft_cache/checkpoints"
save_model_dir = "sft_cache/model/"
offload_folder = "sft_cache/offload"
log_dir=f"{output_dir}/logs"

num_train_epochs = 1

# batch geometry: one sample per device per step, no gradient accumulation
per_device_train_batch_size = 1
per_device_eval_batch_size = 1
gradient_accumulation_steps = 1
gradient_checkpointing = True

# mixed-precision flags: bfloat16 on, float16 off
bf16 = True
fp16 = False

# optimizer settings
max_grad_norm = 0.3
weight_decay = 0.001
optim = "adamw_torch"

# learning-rate schedule
# NOTE(review): lr_scheduler_type is the plain "constant" schedule — confirm
# that warmup_ratio actually takes effect with it; if warm-up is intended,
# "constant_with_warmup" is presumably the matching choice.
learning_rate = 2e-4
warmup_ratio = 0.03
lr_scheduler_type = "constant"

# checkpointing / logging cadence
# NOTE(review): save_strategy "no" presumably means no intermediate
# checkpoints are written during the run — confirm this is intended.
save_strategy = "no"
logging_steps = 25
logging_strategy = "steps"
group_by_length = True

# sequence handling, passed to SFTTrainer below
max_seq_length = 4096
packing = False

In [11]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

tokenizer_config.json:   0%|          | 0.00/960 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/587k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [13]:
# The KV cache is switched off whenever gradient checkpointing is enabled;
# compute the flag once and reuse it for both the load call and the config.
use_kv_cache = not gradient_checkpointing

# 4-bit quantization settings assembled from the configuration cell
bnb_config = BitsAndBytesConfig(
    load_in_4bit=load_in_4bit,
    bnb_4bit_use_double_quant=bnb_4bit_use_double_quant,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    # the config stores the dtype by name; look up the torch dtype object
    bnb_4bit_compute_dtype=getattr(torch, bnb_4bit_compute_dtype),
)

# Load the quantized base model, letting `device_map="auto"` place the weights
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto",
    use_cache=use_kv_cache,
)
model.config.use_cache = use_kv_cache
model.config.pretraining_tp = 1 # num_of_gpus

# Turn on gradient checkpointing in non-reentrant mode
checkpointing_kwargs = {"use_reentrant": False}
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs=checkpointing_kwargs)

config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/41.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/9 [00:00<?, ?it/s]

model-00001-of-00009.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00002-of-00009.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00009.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00004-of-00009.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00005-of-00009.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00006-of-00009.safetensors:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

model-00007-of-00009.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00008-of-00009.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00009-of-00009.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/9 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [14]:
import bitsandbytes as bnb
from peft import LoraConfig

In [15]:
def find_all_linear_names(model):
    """Collect the LoRA target module names of a 4-bit quantized model.

    Walks `model.named_modules()` and keeps the last component of the dotted
    name of every `bnb.nn.Linear4bit` module (e.g. "q_proj" for
    "model.layers.0.self_attn.q_proj").

    Args:
        model: object exposing `named_modules()` that yields
            `(dotted_name, module)` pairs.

    Returns:
        Alphabetically sorted list of unique leaf names, with "lm_head"
        excluded. Sorting keeps the resulting LoRA configuration identical
        from run to run, which a plain `list(set)` does not guarantee.
    """
    leaf_names = {
        # last dotted component; a name without dots is returned unchanged
        name.rsplit(".", 1)[-1]
        for name, module in model.named_modules()
        if isinstance(module, bnb.nn.Linear4bit)
    }
    # the output head is never adapted (needed for 16-bit)
    leaf_names.discard("lm_head")
    return sorted(leaf_names)


# get lora target modules
target_modules = find_all_linear_names(model)

In [16]:
lora_config = LoraConfig(
    r=r,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    target_modules=target_modules,
    bias="none",
    task_type=task_type,
)

In [17]:
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

In [18]:
# checkout for more info: Train on completions only https://huggingface.co/docs/trl/en/sft_trainer

def formatting_prompts_func(example):
    """Build the training texts for a batch of examples.

    Args:
        example: batch mapping whose 'prompt' and 'completion' entries are
            index-aligned sequences of strings.

    Returns:
        One string per prompt: the prompt, a blank line, the " ### Answer: "
        marker and the matching completion.
    """
    return [
        f"{prompt}\n\n ### Answer: {example['completion'][idx]}"
        for idx, prompt in enumerate(example['prompt'])
    ]

collator = DataCollatorForCompletionOnlyLM(
    response_template="### Answer:", 
    tokenizer=tokenizer
)

In [19]:
# Trainer configuration, built entirely from the values in the configuration
# cell so that nothing defined there is silently ignored.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    # configured log directory (was defined but never passed on)
    logging_dir=log_dir,
    num_train_epochs=num_train_epochs,
    #max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    # configured eval batch size (was defined but never passed on, so the
    # library default was used instead)
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    # Same non-reentrant mode that is requested on the model right after
    # loading; passing it here as well keeps the trainer from re-enabling
    # checkpointing with different settings.
    gradient_checkpointing_kwargs={"use_reentrant": False},
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    weight_decay=weight_decay,
    optim=optim,
    learning_rate=learning_rate,
    warmup_ratio=warmup_ratio,
    lr_scheduler_type=lr_scheduler_type,
    save_strategy=save_strategy,
    logging_steps=logging_steps,
    logging_strategy=logging_strategy,
    group_by_length=group_by_length,
)

In [20]:
# initialize sft trainer
trainer = SFTTrainer(
    args=training_arguments,
    model=model,
    peft_config=lora_config,
    tokenizer=tokenizer,
    train_dataset=dataset['train'],
    eval_dataset=dataset["test"],
    formatting_func=formatting_prompts_func,
    data_collator=collator,
    max_seq_length=max_seq_length,
    packing=packing,
    compute_metrics=None,
)


Map:   0%|          | 0/99996 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [None]:
trainer.train()



Step,Training Loss
25,0.1727
50,0.0284
75,0.0664
100,0.0477
125,0.0311
150,0.0511
175,0.0568
200,0.0571
225,0.0725
250,0.071


In [None]:
trainer.model.save_pretrained(output_dir, safe_serialization=False)

In [None]:
text = dataset['test'][0]
text

In [None]:
# clear memory
del model
del trainer
torch.cuda.empty_cache()

#### step 6: merge adapter weights and base model

In [None]:
from peft import AutoPeftModelForCausalLM

In [None]:
# Load the base model together with the saved LoRA adapter in bfloat16: the
# adapter was trained with bf16 = True and the merged model is later loaded
# with torch_dtype=torch.bfloat16, so merging in float16 would add an
# avoidable dtype conversion with a narrower value range.
# `trust_remote_code` is not passed: the checkpoint uses an architecture that
# ships with transformers, so there is no reason to allow remote code.
model = AutoPeftModelForCausalLM.from_pretrained(
    output_dir,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
)

In [None]:
print(model)

In [None]:
# merge
merged_model = model.merge_and_unload()

In [None]:
print(merged_model)

In [None]:
# save merged model
merged_model.save_pretrained(save_model_dir, safe_serialization=True,  max_shard_size="2GB")

In [None]:
# save tokenizer for easy inference
tokenizer.save_pretrained(save_model_dir)

In [None]:
del model
del merged_model
del tokenizer

torch.cuda.empty_cache()

### 5. Test and evaluate

In [None]:
# NOTE: restart the kernel and run from this section

#### prepare test dataset

In [None]:
# uncomment the test dataset and run all the cells within section 3: Create and prepare dataset

#### inference: finetuned model

In [None]:
import gc, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.cuda.empty_cache()
gc.collect()

In [None]:
model_local_path = "sft_cache/model/"
print(f"model_local_path: {model_local_path}")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    model_local_path, trust_remote_code=True
)
tokenizer.pad_token = tokenizer.eos_token

sft_model = AutoModelForCausalLM.from_pretrained(
    model_local_path,
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
eval_sample = dataset['test'][0]
eval_prompt, eval_completion = eval_sample["prompt"], eval_sample["completion"]

print(f"prompt: {eval_prompt}")
print("\n", f"*"*25, "\n")
print(f"completion: {eval_completion}")

In [None]:
# Training texts were built as "<prompt>\n\n ### Answer: <completion>", and
# the loss was computed only on what follows "### Answer:". The prompt used
# for inference therefore has to end with the same marker; without it the
# model is asked to continue a text it was never trained on.
# (No trailing space: the tokenizer attaches the space to the first
# completion token, as it did during training.)
RESPONSE_MARKER = "\n\n ### Answer:"

model_inputs = tokenizer(
    [eval_prompt + RESPONSE_MARKER], return_tensors="pt"
).to("cuda")
prompt_length = model_inputs["input_ids"].shape[1]

sft_model.eval()
with torch.no_grad():
    # Greedy decoding gives a deterministic, comparable result. The previous
    # settings (repetition_penalty=2.0, length_penalty=-5.0, sampled beam
    # search) heavily penalise every token that already occurred — which, for
    # markup, is nearly every token — and push towards truncated output.
    generated_ids = sft_model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

# Keep only the newly generated tokens: slice along the sequence axis
# (axis 1), not the batch axis.
results = tokenizer.batch_decode(
    generated_ids[:, prompt_length:], skip_special_tokens=True
)[0]
print(results)

#### inference: original model

In [None]:
del sft_model
del tokenizer

In [None]:
import gc, torch
from transformers import AutoModelForCausalLM, AutoTokenizer

torch.cuda.empty_cache()
gc.collect()

In [None]:
model_id = "mistral-community/Codestral-22B-v0.1"
print(f"model_id: {model_id}")

In [None]:
# Load the tokenizer from the same Hub id as the base model. The previous
# version read it from `model_local_path` (the fine-tuned model directory),
# which made the baseline depend on the fine-tuning output existing on disk.
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Unmodified base model, loaded with the same attention implementation and
# dtype as the fine-tuned model so the two runs are comparable.
base_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    attn_implementation="flash_attention_2",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

In [None]:
eval_sample = dataset['test'][0]
eval_prompt, eval_completion = eval_sample["prompt"], eval_sample["completion"]

print(f"prompt: {eval_prompt}")
print("\n", f"*"*25, "\n")
print(f"completion: {eval_completion}")

In [None]:
# Baseline run: the base model receives the plain instruct prompt (it was
# never trained on the "### Answer:" marker).
model_inputs = tokenizer([eval_prompt], return_tensors="pt").to("cuda")
prompt_length = model_inputs["input_ids"].shape[1]

base_model.eval()
with torch.no_grad():
    # Same token budget and deterministic (greedy) decoding as the fine-tuned
    # run, so the two outputs can be compared directly. The previous
    # max_new_tokens=32_000 with sampling allowed an effectively unbounded,
    # non-reproducible generation.
    generated_ids = base_model.generate(
        **model_inputs,
        max_new_tokens=1000,
        do_sample=False,
        pad_token_id=tokenizer.pad_token_id,
    )

# Decode only the newly generated tokens (slice along the sequence axis).
results = tokenizer.batch_decode(
    generated_ids[:, prompt_length:], skip_special_tokens=True
)[0]
print(results)

In [None]:
!ls -lh $output_dir