In [1]:
!pip -q install -U huggingface_hub hf_transfer
!export HF_HUB_ENABLE_HF_TRANSFER=1

import os
from google.colab import userdata
from huggingface_hub import login, whoami

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # mitigate fragmentation
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
HF_TOKEN = userdata.get('HUGGINFACE_TOKEN')
login(token=HF_TOKEN, add_to_git_credential=True)  # also sets Git creds for LFS

print("Logged in as:", whoami(token=HF_TOKEN)["name"])

Logged in as: j05hr3d


In [2]:
from google.colab import auth
auth.authenticate_user()

project_id = 'code-llm-finetuning'
!gcloud config set project {project_id}

GCS_PATH = "gs://code-llm-fine-tuning-security-analytics/data/source_code.jsonl"
LOCAL_PATH = "/tmp/source_code.jsonl"

!gsutil ls {GCS_PATH}
!gsutil -m cp {GCS_PATH} {LOCAL_PATH}

!ls -lh {LOCAL_PATH}
!head -n 2 {LOCAL_PATH}

#!gsutil cp gs://code-llm-fine-tuning-security-analytics/data/source_code.jsonl /tmp/source_code.jsonl
#!cat /tmp/source_code.jsonl

Updated property [core/project].
gs://code-llm-fine-tuning-security-analytics/data/source_code.jsonl
Copying gs://code-llm-fine-tuning-security-analytics/data/source_code.jsonl...
- [1/1 files][  3.4 KiB/  3.4 KiB] 100% Done                                    
Operation completed over 1 objects/3.4 KiB.                                      
-rw-r--r-- 1 root root 3.4K Sep 12 00:26 /tmp/source_code.jsonl
{"filename": "cryptol/examples/Karatsuba.cry", "content": "/*\n * Copyright (c) 2017 Galois, Inc.\n * Distributed under the terms of the BSD3 license (see LICENSE file)\n *\n * This module defines a Karatsuba multiplier, which is polymorphic over the\n * bitwidth of the constituent limbs and the size of the inputs.\n *\n * Note the trick of using `make_atleast` in the recursive case to teach the\n * typecheker that the current bitwidth is long enough for the split multiplier\n * to be sensible.  The minimum width for which the splitting recursive case\n * works out happens to be 6.  Thi

In [3]:
!pip install -q transformers datasets peft bitsandbytes flash-attn

MODEL = "Qwen/Qwen2.5-Coder-7B"  # Model checkpoint on the Hugging Face Hub
DATASET = "/tmp/source_code.jsonl"
DATA_COLUMN = "content"  # Column name containing the code content

SEQ_LENGTH = 2048  # Sequence length

# Training arguments
MAX_STEPS = 1#2000  # max_steps
BATCH_SIZE = 1  # batch_size
GR_ACC_STEPS = 16  # gradient_accumulation_steps
LR = 5e-4  # learning_rate
LR_SCHEDULER_TYPE = "cosine"  # lr_scheduler_type
WEIGHT_DECAY = 0.01  # weight_decay
NUM_WARMUP_STEPS = 30  # num_warmup_steps
EVAL_FREQ = 100  # eval_freq
SAVE_FREQ = 100  # save_freq
LOG_FREQ = 25  # log_freq
OUTPUT_DIR = "peft-FT-2.5-Coder-7b"  # output_dir
BF16 = True  # bf16
FP16 = False  # no_fp16

# FIM trasformations arguments
FIM_RATE = 0.5  # fim_rate
FIM_SPM_RATE = 0.5  # fim_spm_rate

# LORA
LORA_R = 8  # lora_r
LORA_ALPHA = 32  # lora_alpha
LORA_DROPOUT = 0.05  # lora_dropout
LORA_TARGET_MODULES = "c_proj,c_attn,q_attn,c_fc,c_proj"  # lora_target_modules

# bitsandbytes config
USE_NESTED_QUANT = True  # use_nested_quant
BNB_4BIT_COMPUTE_DTYPE = "bfloat16"  # bnb_4bit_compute_dtype

SEED = 0

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/8.4 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/8.4 MB[0m [31m105.2 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m8.4/8.4 MB[0m [31m111.8 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for flash-attn (setup.py) ... [?25l[?25hdone


In [4]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig,
)

set_seed(SEED)

In [5]:
from datasets import load_dataset

DATASET = LOCAL_PATH
DATA_COLUMN = "content"

dataset = load_dataset("json", data_files={"train": DATASET})
train_data = dataset["train"]

# quick sanity check: make sure the field exists
print(train_data[0].keys())
assert DATA_COLUMN in train_data.column_names, f"Missing '{DATA_COLUMN}' in JSONL!"


Generating train split: 0 examples [00:00, ? examples/s]

dict_keys(['filename', 'content'])


In [6]:
import torch
from tqdm import tqdm

tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)

def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """

    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        total_characters += len(example[data_column])
        total_tokens += len(tokenizer(example[data_column]).tokens())

    return total_characters / total_tokens


chars_per_token = chars_token_ratio(train_data, tokenizer, DATA_COLUMN)
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

  0%|          | 1/400 [00:00<00:02, 177.85it/s]

The character to token ratio of the dataset is: 3.19





In [8]:
import functools
import numpy as np


# Helper function to get token ids of the special tokens for prefix, suffix and middle for FIM transformations.
@functools.lru_cache(maxsize=None)
def get_fim_token_ids(tokenizer):
    try:
        FIM_PREFIX, FIM_MIDDLE, FIM_SUFFIX, FIM_PAD = tokenizer.special_tokens_map["additional_special_tokens"][1:5]
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = (
            tokenizer.vocab[tok] for tok in [FIM_SUFFIX, FIM_PREFIX, FIM_MIDDLE, FIM_PAD]
        )
    except KeyError:
        suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id = None, None, None, None
    return suffix_tok_id, prefix_tok_id, middle_tok_id, pad_tok_id


## Adapted from https://github.com/bigcode-project/Megatron-LM/blob/6c4bf908df8fd86b4977f54bf5b8bd4b521003d1/megatron/data/gpt_dataset.py
def permute(
    sample,
    np_rng,
    suffix_tok_id,
    prefix_tok_id,
    middle_tok_id,
    pad_tok_id,
    fim_rate=0.5,
    fim_spm_rate=0.5,
    truncate_or_pad=False,
):
    """
    Take in a sample (list of tokens) and perform a FIM transformation on it with a probability of fim_rate, using two FIM modes:
    PSM and SPM (with a probability of fim_spm_rate).
    """

    # The if condition will trigger with the probability of fim_rate
    # This means FIM transformations will apply to samples with a probability of fim_rate
    if np_rng.binomial(1, fim_rate):

        # Split the sample into prefix, middle, and suffix, based on randomly generated indices stored in the boundaries list.
        boundaries = list(np_rng.randint(low=0, high=len(sample) + 1, size=2))
        boundaries.sort()

        prefix = np.array(sample[: boundaries[0]], dtype=np.int64)
        middle = np.array(sample[boundaries[0] : boundaries[1]], dtype=np.int64)
        suffix = np.array(sample[boundaries[1] :], dtype=np.int64)

        if truncate_or_pad:
            # calculate the new total length of the sample, taking into account tokens indicating prefix, middle, and suffix
            new_length = suffix.shape[0] + prefix.shape[0] + middle.shape[0] + 3
            diff = new_length - len(sample)

            # trancate or pad if there's a difference in length between the new length and the original
            if diff > 0:
                if suffix.shape[0] <= diff:
                    return sample, np_rng
                suffix = suffix[: suffix.shape[0] - diff]
            elif diff < 0:
                suffix = np.concatenate([suffix, np.full((-1 * diff), pad_tok_id)])

        # With the probability of fim_spm_rateapply SPM variant of FIM transformations
        # SPM: suffix, prefix, middle
        if np_rng.binomial(1, fim_spm_rate):
            new_sample = np.concatenate(
                [
                    [prefix_tok_id, suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    prefix,
                    middle,
                ]
            )
        # Otherwise, apply the PSM variant of FIM transformations
        # PSM: prefix, suffix, middle
        else:

            new_sample = np.concatenate(
                [
                    [prefix_tok_id],
                    prefix,
                    [suffix_tok_id],
                    suffix,
                    [middle_tok_id],
                    middle,
                ]
            )
    else:
        # don't apply FIM transformations
        new_sample = sample

    return list(new_sample), np_rng


In [9]:
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
import random

# Create an Iterable dataset that returns constant-length chunks of tokens from a stream of text files.

class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permuations that will use SPM.
            seed (int): Seed for random number generator.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = tokenizer.eos_token_id
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed

        (
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = get_fim_token_ids(self.tokenizer)
        if not self.suffix_tok_id and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        np_rng = np.random.RandomState(seed=self.seed)
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []

            for tokenized_input in tokenized_inputs:
                # optionally do FIM permutations
                if self.fim_rate > 0:
                    tokenized_input, np_rng = permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                    )

                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            examples = []
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                if len(input_ids) == self.seq_length:
                    examples.append(input_ids)
            random.shuffle(examples)
            for example in examples:
                self.current_size += 1
                yield {
                    "input_ids": torch.LongTensor(example),
                    "labels": torch.LongTensor(example),
                }


train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        infinite=True,
        seq_length=SEQ_LENGTH,
        chars_per_token=chars_per_token,
        content_field=DATA_COLUMN,
        fim_rate=FIM_RATE,
        fim_spm_rate=FIM_SPM_RATE,
        seed=SEED,
)
eval_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        #valid_data,
        infinite=False,
        seq_length=SEQ_LENGTH,
        chars_per_token=chars_per_token,
        content_field=DATA_COLUMN,
        fim_rate=FIM_RATE,
        fim_spm_rate=FIM_SPM_RATE,
        seed=SEED,
)

In [10]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from peft.tuners.lora import LoraLayer

load_in_8bit = False

# 4-bit quantization
compute_dtype = getattr(torch, BNB_4BIT_COMPUTE_DTYPE)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=USE_NESTED_QUANT,
)

device_map = "auto"#{"": 0}

model = AutoModelForCausalLM.from_pretrained(
        MODEL,
        load_in_8bit=load_in_8bit,
        quantization_config=bnb_config,
        device_map=device_map,
        use_cache=False,  # We will be using gradient checkpointing
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
)

model = prepare_model_for_kbit_training(model)

config.json:   0%|          | 0.00/668 [00:00<?, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.33G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.88G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

In [12]:
blk = model.model.layers[0]           # Llama/Qwen-style
print("ATTN:", blk.self_attn)         # has q_proj, k_proj, v_proj, o_proj
print("MLP:", blk.mlp)
target_modules = ["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]

ATTN: Qwen2Attention(
  (q_proj): Linear4bit(in_features=3584, out_features=3584, bias=True)
  (k_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
  (v_proj): Linear4bit(in_features=3584, out_features=512, bias=True)
  (o_proj): Linear4bit(in_features=3584, out_features=3584, bias=False)
)
MLP: Qwen2MLP(
  (gate_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
  (up_proj): Linear4bit(in_features=3584, out_features=18944, bias=False)
  (down_proj): Linear4bit(in_features=18944, out_features=3584, bias=False)
  (act_fn): SiLU()
)


In [13]:
# Set up lora
peft_config = LoraConfig(
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    r=LORA_R,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=target_modules,
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 20,185,088 || all params: 7,635,801,600 || trainable%: 0.2643


In [14]:
train_data.start_iteration = 0


training_args = TrainingArguments(
    output_dir=f"j05hr3d/{OUTPUT_DIR}",
    dataloader_drop_last=True,
    eval_strategy="steps",
    save_strategy="steps",
    max_steps=MAX_STEPS,
    eval_steps=EVAL_FREQ,
    save_steps=SAVE_FREQ,
    logging_steps=LOG_FREQ,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=LR,
    lr_scheduler_type=LR_SCHEDULER_TYPE,
    warmup_steps=NUM_WARMUP_STEPS,
    gradient_accumulation_steps=GR_ACC_STEPS,
    gradient_checkpointing=True,
    fp16=FP16,
    bf16=BF16,
    weight_decay=WEIGHT_DECAY,
    push_to_hub=True,
    include_tokens_per_second=True,
    report_to=[]
)

In [15]:
trainer = Trainer(
    model=model, args=training_args, train_dataset=train_dataset, eval_dataset=eval_dataset
)

print("Training...")
trainer.train()

Training...


Casting fp32 inputs back to torch.bfloat16 for flash-attn compatibility.


Step,Training Loss,Validation Loss


TrainOutput(global_step=1, training_loss=0.8815891146659851, metrics={'train_runtime': 19.7849, 'train_samples_per_second': 0.809, 'train_steps_per_second': 0.051, 'train_tokens_per_second': 1656.212, 'total_flos': 1394108836872192.0, 'train_loss': 0.8815891146659851, 'epoch': 1.0})

In [16]:
trainer.push_to_hub()

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

  ...t-FT-2.5-Coder-7b/training_args.bin: 100%|##########| 5.78kB / 5.78kB            

  ...-Coder-7b/adapter_model.safetensors:  52%|#####1    | 41.8MB / 80.8MB            

CommitInfo(commit_url='https://huggingface.co/j05hr3d/peft-FT-2.5-Coder-7b/commit/428cf13cad655087cfb29db84a098fc419f9f3ec', commit_message='End of training', commit_description='', oid='428cf13cad655087cfb29db84a098fc419f9f3ec', pr_url=None, repo_url=RepoUrl('https://huggingface.co/j05hr3d/peft-FT-2.5-Coder-7b', endpoint='https://huggingface.co', repo_type='model', repo_id='j05hr3d/peft-FT-2.5-Coder-7b'), pr_revision=None, pr_num=None)