# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing


### Install required libraries

In [1]:
! pip install "transformers<4.41" datasets ipywidgets torch
! pip install accelerate -U
! pip install bitsandbytes -U



In [2]:
import sys
import logging

# Append-mode log file that receives a copy of the notebook's output.
# The handle is left open on purpose for the rest of the session; it is not
# closed anywhere in the cells shown here.
nblog = open("1-train-yaml.log", "a+")
# NOTE(review): presumably relies on the kernel's stdout/stderr stream objects
# honouring an `echo` attribute -- confirm against the ipykernel version in use.
sys.stdout.echo = nblog
sys.stderr.echo = nblog

# Redirect the first handler of the IPython application logger to the same
# file and log at INFO level. `logging` here is the standard-library module
# imported above; a later import cell rebinds the name `logging` to the
# transformers logging utility, so this cell must run before that one.
get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

# Notebook magic; the recorded output reports "Autosaving every 5 seconds".
%autosave 5

Autosaving every 5 seconds


In [3]:
# In case very detailed information is needed from the training phase
# from transformers.utils import logging
# logging.set_verbosity_info()

### Log in to HuggingFace to access The Stack

In [4]:
import os
from huggingface_hub import login
# Note: Prefer notebook_login() but this was not prompting properly
# The token is read from the HUGGINGFACE_TOKEN environment variable.
# os.environ.get() returns None when the variable is unset, so export it
# before starting the kernel. Never paste the token into the notebook.
login(token=os.environ.get('HUGGINGFACE_TOKEN'))

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


### Import required libraries

In [5]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig
)
from dataset import ConstantLengthDataset
import pprint

In [6]:
# BitsAndBytesConfig describes how bitsandbytes should quantize the model weights when a model is loaded.
# NOTE(review): in the cells visible here `bnb_config` is defined but never passed to `from_pretrained`,
# so the model below appears to be loaded unquantized -- confirm whether that is intended.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                     # Load the model weights in a 4-bit quantized format to reduce memory usage.
    bnb_4bit_use_double_quant=True,        # Nested ("double") quantization: the quantization constants are themselves quantized to save extra memory.
    bnb_4bit_quant_type="nf4",             # Use the 4-bit NormalFloat ('nf4') quantization data type.
    bnb_4bit_compute_dtype=torch.bfloat16  # Perform computation in bfloat16.
)

### Specify the pre-trained model and dataset

In [7]:
# Identifier of the pre-trained model (used for both the tokenizer and the model).
model_id = "bigcode/santacoder"
# Identifier of the dataset and the sub-directory passed as `data_dir` to load_dataset.
dataset_id = "bigcode/the-stack-dedup"
data_dir = "data/yaml"

### Load the tokenizer

In [8]:
# Load the tokenizer for `model_id`; it is used below to measure characters
# per token and is handed to ConstantLengthDataset.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the model

In [9]:
# NOTE: trust_remote_code=True allows Python code shipped in the model
# repository to run locally -- review it (and ideally pin a revision) first.
# use_cache=False: presumably because gradient checkpointing is enabled in the
# training arguments further down -- TODO confirm.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

### Load the dataset
Use the first 10% of the dataset

In [10]:
# split='train[:10%]' selects only the first 10% of the train split of the
# `data_dir` subset of `dataset_id`.
dataset = load_dataset(dataset_id, data_dir=data_dir, split='train[:10%]')

Resolving data files:   0%|          | 0/64 [00:00<?, ?it/s]

### Calculate characters per token

In [11]:
# Estimate the average number of characters per token from the first
# `examples` rows of the dataset; the ratio is passed to
# ConstantLengthDataset as `chars_per_token` further down.
examples = 500
total_characters = 0
total_tokens = 0

# zip() with range() stops after `examples` rows (or earlier if the dataset is shorter).
sample_stream = zip(range(examples), iter(dataset))
for _, example in tqdm(sample_stream, total=examples):
    content = example['content']
    total_characters += len(content)
    total_tokens += len(tokenizer(content).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)

  0%|          | 0/500 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3514 > 2048). Running this sequence through the model will result in indexing errors


2.4527346671628827


### Create a validation split

In [12]:
# Hold out 0.5% of the rows (test_size=0.005); the "test" part is used as the
# validation set below.
# NOTE(review): shuffle=False is passed, so the `seed` argument presumably has
# no effect on this call -- confirm.
# NOTE: this rebinds `dataset` to the split result, so re-running the cell would
# call train_test_split on the split result rather than on the loaded dataset.
dataset = dataset.train_test_split(test_size=0.005, shuffle=False, seed=555)
train_ds = dataset["train"]
valid_ds = dataset["test"]
# Cell output: (number of training rows, number of validation rows).
len(train_ds), len(valid_ds)

(525566, 2642)

In [13]:
# Shuffle the training rows with a fixed seed.
train_ds = train_ds.shuffle(seed=555)

# Keyword arguments shared by the training and validation wrappers.
_packing_kwargs = dict(seq_length=1024, chars_per_token=characters_per_token)

# ConstantLengthDataset comes from the local `dataset` module (not shown here).
# The training wrapper is created with infinite=True, the validation wrapper
# with infinite=False.
train_dataset = ConstantLengthDataset(tokenizer, train_ds, infinite=True, **_packing_kwargs)
valid_dataset = ConstantLengthDataset(tokenizer, valid_ds, infinite=False, **_packing_kwargs)

### Define the training arguments

In [14]:
# Hyper-parameters for a short 100-step run (evaluate and checkpoint every 50 steps).
# For a full run raise max_steps (e.g. 1000 or 5000) and eval_steps/save_steps (e.g. 500).
training_args = TrainingArguments(
    output_dir="santacoder-finetuned-the-stack-yaml",
    dataloader_drop_last=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    optim="adafactor",
    # `evaluation_strategy` is the argument name for the pinned transformers<4.41.
    evaluation_strategy="steps",
    max_steps=100,
    eval_steps=50,
    save_steps=50,
    # Log by step so that `logging_steps` takes effect. With
    # logging_strategy='epoch' it was ignored and no training loss was
    # reported during the run.
    logging_strategy="steps",
    logging_steps=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    # Was warmup_steps=100, which equals max_steps: the whole run was spent in
    # linear warmup and the cosine decay never started. A ratio scales with
    # max_steps (0.02 -> 2 steps here, 100 steps for a 5000-step run).
    warmup_ratio=0.02,
    weight_decay=0.05,
    fp16=False,
    push_to_hub=False,
    # load_best_model_at_end=True,
)

### Set up logging

In [15]:
from transformers import EarlyStoppingCallback, TrainerCallback
import time
from datetime import datetime, timedelta, timezone
from zoneinfo import ZoneInfo

In [16]:
current_timezone = 'Europe/Brussels'

class ETAProgressBarCallback(TrainerCallback):
    """Trainer callback that prints step progress, throughput and an ETA.

    Timestamps shown to the user are rendered in ``current_timezone``.
    Elapsed time is measured with ``time.monotonic()`` so it has sub-second
    resolution and is unaffected by wall-clock adjustments or daylight-saving
    transitions.
    """

    _TIME_FORMAT = '%Y-%m-%d %H:%M:%S'

    def __init__(self):
        self.start_time = None             # aware datetime captured at train begin
        self.total_steps = None            # state.max_steps captured at train begin
        self.current_step = 0              # on_step_end calls seen in the current run
        self.average_step_duration = None  # timedelta: mean duration of a step so far
        self._start_monotonic = None       # time.monotonic() captured at train begin

    def _now(self):
        """Return the current time as an aware datetime in ``current_timezone``."""
        return datetime.now(ZoneInfo(current_timezone))

    def on_train_begin(self, args, state, control, **kwargs):
        """Reset the per-run counters and print the training start time."""
        # Reset so that calling trainer.train() again with the same callback
        # instance does not carry over the previous run's step count.
        self.current_step = 0
        self.average_step_duration = None
        self.start_time = self._now()
        self._start_monotonic = time.monotonic()
        self.total_steps = state.max_steps

        str_current_time = self.start_time.strftime(self._TIME_FORMAT)
        print(f"Starting training at {str_current_time}")

    def on_step_end(self, args, state, control, **kwargs):
        """Print progress, iterations per second and the estimated finish time."""
        self.current_step += 1

        elapsed_seconds = time.monotonic() - self._start_monotonic
        seconds_per_step = elapsed_seconds / self.current_step
        self.average_step_duration = timedelta(seconds=seconds_per_step)

        # Remaining time = average step duration * steps still to run
        # (clamped at zero so the formatted value can never go negative).
        steps_left = max(self.total_steps - self.current_step, 0)
        seconds_left = seconds_per_step * steps_left

        days, remainder = divmod(int(seconds_left), 24 * 3600)
        hours, remainder = divmod(remainder, 3600)
        minutes, seconds = divmod(remainder, 60)
        estimated_time_left_formatted = f"{days}d {hours:02d}h:{minutes:02d}m:{seconds:02d}s"

        # Build the end time from an absolute timestamp so the displayed local
        # time stays correct even if a DST change falls inside the interval.
        estimated_end_time = datetime.fromtimestamp(
            time.time() + seconds_left, ZoneInfo(current_timezone)
        ).strftime(self._TIME_FORMAT)

        # Guard against a zero elapsed time (previously a ZeroDivisionError
        # whenever the elapsed time was under one second).
        it_per_sec = self.current_step / elapsed_seconds if elapsed_seconds > 0 else 0.0
        # Fall back to 0.0 if the trainer has not populated the epoch yet.
        current_epoch = state.epoch if state.epoch is not None else 0.0

        print(f"Step: {self.current_step}/{self.total_steps}, epoch: {current_epoch:.2f}, It/s: {it_per_sec:.2f}, est. time left: {estimated_time_left_formatted}, est. end time: {estimated_end_time}")

    def on_evaluate(self, args, state, control, model=None, metrics=None, **kwargs):
        """Print every evaluation metric after an evaluation phase."""
        if metrics:
            print("Evaluation metrics:")
            for key, value in metrics.items():
                print(f"{key}: {value}")

    def on_train_end(self, args, state, control, model=None, **kwargs):
        """Print the time at which training finished."""
        str_current_time = self._now().strftime(self._TIME_FORMAT)
        print(f"Ended training at {str_current_time}")

### Train

In [17]:
# NOTE(review): ConstantLengthDataset is defined in the local `dataset` module
# (not shown here); presumably `start_iteration` controls where iteration over
# the training stream begins -- confirm against that module.
train_dataset.start_iteration = 0

In [18]:
# Wire the model, arguments and datasets into a Trainer.
# Progress/ETA lines are printed by ETAProgressBarCallback.
# Currently disabled options, kept for reference:
#   tokenizer=tokenizer
#   EarlyStoppingCallback(early_stopping_patience=5, early_stopping_threshold=1.0)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    callbacks=[ETAProgressBarCallback()],
)

max_steps is given, it will override any value given in num_train_epochs


In [None]:
# Run fine-tuning with the arguments, datasets and callbacks configured above.
trainer.train()

Starting training at 2024-07-08 13:13:15




Step: 1/100, epoch: 0.01, It/s: 0.01, est. time left: 0d 04h:28m:40s, est. end time: 2024-07-08 17:44:38


Step,Training Loss,Validation Loss


Step: 2/100, epoch: 0.02, It/s: 0.01, est. time left: 0d 04h:21m:34s, est. end time: 2024-07-08 17:40:09
Step: 3/100, epoch: 0.03, It/s: 0.01, est. time left: 0d 04h:20m:23s, est. end time: 2024-07-08 17:41:42
Step: 4/100, epoch: 0.04, It/s: 0.01, est. time left: 0d 04h:15m:56s, est. end time: 2024-07-08 17:39:51
Step: 5/100, epoch: 0.05, It/s: 0.01, est. time left: 0d 04h:13m:11s, est. end time: 2024-07-08 17:39:46
Step: 6/100, epoch: 0.06, It/s: 0.01, est. time left: 0d 04h:09m:34s, est. end time: 2024-07-08 17:38:46
Step: 7/100, epoch: 0.07, It/s: 0.01, est. time left: 0d 04h:07m:04s, est. end time: 2024-07-08 17:38:55
Step: 8/100, epoch: 0.08, It/s: 0.01, est. time left: 0d 04h:03m:43s, est. end time: 2024-07-08 17:38:10
Step: 9/100, epoch: 0.09, It/s: 0.01, est. time left: 0d 04h:01m:03s, est. end time: 2024-07-08 17:38:08
Step: 10/100, epoch: 0.10, It/s: 0.01, est. time left: 0d 03h:58m:01s, est. end time: 2024-07-08 17:37:43
Step: 11/100, epoch: 0.11, It/s: 0.01, est. time left:

In [None]:
# No path is passed to save_model() here.
trainer.save_model() # this creates the config.json file etc you need to run generation tests

### Evaluate