# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing


### Install required libraries

In [None]:
! pip install "transformers<4.41" datasets ipywidgets torch
! pip install accelerate -U
! pip install bitsandbytes -U

Collecting transformers<4.41
  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting ipywidgets
  Downloading ipywidgets-8.1.3-py3-none-any.whl.metadata (2.4 kB)
Collecting torch
  Downloading torch-2.3.1-cp39-cp39-manylinux1_x86_64.whl.metadata (26 kB)
Collecting filelock (from transformers<4.41)
  Downloading filelock-3.15.4-py3-none-any.whl.metadata (2.9 kB)
Collecting huggingface-hub<1.0,>=0.19.3 (from transformers<4.41)
  Downloading huggingface_hub-0.23.4-py3-none-any.whl.metadata (12 kB)
Collecting numpy>=1.17 (from transformers<4.41)
  Downloading numpy-2.0.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.9/60.9 kB[0m [31m61.3 MB/s[0m eta [36m0:00:00

In [None]:
import sys
import logging

nblog = open("1-train-yaml.log", "a+")
sys.stdout.echo = nblog
sys.stderr.echo = nblog

get_ipython().log.handlers[0].stream = nblog
get_ipython().log.setLevel(logging.INFO)

%autosave 5

### Log in to HuggingFace to access The Stack

In [None]:
import os
from huggingface_hub import login

# Note: Prefer notebook_login() but this was not prompting properly, so the
# token is taken from the HUGGINGFACE_TOKEN environment variable instead.
if not os.environ.get('HUGGINGFACE_TOKEN'):
    # login(token=None) would fall back to the interactive prompt that does
    # not work here, so fail immediately with an actionable message.
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; export it in the environment that "
        "starts the notebook kernel before running this cell."
    )

# Read the token straight from the environment so it is not kept in a
# notebook variable.
login(token=os.environ['HUGGINGFACE_TOKEN'])

### Import required libraries

In [None]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
    BitsAndBytesConfig
)
from dataset import ConstantLengthDataset

In [None]:
# 4-bit quantization settings for loading a model through bitsandbytes.
# NOTE(review): none of the cells in this notebook pass `bnb_config` to
# from_pretrained(), so the model is currently loaded without quantization.
bnb_config = BitsAndBytesConfig(
    # Load the weights in a 4-bit quantized format to reduce memory usage.
    load_in_4bit=True,
    # Use 'nf4' as the 4-bit data type for the quantized weights.
    bnb_4bit_quant_type="nf4",
    # Double (nested) quantization: the quantization constants themselves are
    # quantized a second time. It does not quantize activations.
    bnb_4bit_use_double_quant=True,
    # Perform computation in bfloat16.
    bnb_4bit_compute_dtype=torch.bfloat16,
)

### Specify the pre-trained model and dataset

In [None]:
# Identifiers consumed by the loading cells below.
# Checkpoint that gets fine-tuned:
model_id = "bigcode/santacoder"
# Dataset repository and the sub-directory (YAML files) to read from it:
dataset_id, data_dir = "bigcode/the-stack-dedup", "data/yaml"

### Load the tokenizer

In [None]:
# Load the tokenizer published with the `model_id` checkpoint. It is used
# below to measure characters per token and to build the training datasets.
tokenizer = AutoTokenizer.from_pretrained(model_id)

### Load the model

In [None]:
# Load the causal language model for `model_id`.
#   - trust_remote_code=True lets Python code shipped in the model repository
#     run locally. NOTE(review): consider pinning `revision` to a reviewed
#     commit so that code cannot change underneath this notebook.
#   - use_cache=False is forwarded to the model configuration; presumably set
#     because training below enables gradient checkpointing -- confirm.
# NOTE(review): the `bnb_config` object built earlier is not passed here
# (no quantization_config argument), so the weights are not loaded in 4-bit.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

### Load the dataset
Use only the first 10% of the `train` split.

In [None]:
# Load `dataset_id`, restricted to the `data_dir` sub-directory, taking only
# the first 10% of the `train` split (the 'train[:10%]' slice).
dataset = load_dataset(dataset_id, data_dir=data_dir, split='train[:10%]')

### Calculate characters per token

In [None]:
# Estimate the average number of characters per token from a small sample of
# the dataset; the ratio is later handed to ConstantLengthDataset.
examples, total_characters, total_tokens = 500, 0, 0

# Never ask for more rows than the dataset holds, so the progress bar's total
# matches the number of rows actually processed.
sample_size = min(examples, len(dataset))

for _, example in tqdm(zip(range(sample_size), dataset), total=sample_size):
    text = example['content']
    total_characters += len(text)
    total_tokens += len(tokenizer(text).tokens())

# An empty dataset (or a sample containing only empty files) would otherwise
# surface as an unexplained ZeroDivisionError.
if total_tokens == 0:
    raise ValueError(
        "Cannot compute characters per token: the sampled examples produced no tokens."
    )

characters_per_token = total_characters / total_tokens
print(characters_per_token)

### Create a test split

In [None]:
# Split off 0.5% of the rows, without shuffling, for evaluation.
# `dataset` is rebound to the resulting mapping of "train"/"test" splits.
dataset = dataset.train_test_split(
    test_size=0.005,
    shuffle=False,
    seed=555,
)
train_ds, valid_ds = dataset["train"], dataset["test"]

# Display the size of each split as the cell's output.
(len(train_ds), len(valid_ds))

In [None]:
# Shuffle the training rows with a fixed seed.
train_ds = train_ds.shuffle(seed=555)

# Wrap both splits in ConstantLengthDataset with the same sequence length and
# characters-per-token estimate; only `infinite` differs between them.
# NOTE(review): ConstantLengthDataset comes from the local `dataset` module;
# presumably it packs tokenized text into fixed-length sequences -- confirm there.
train_dataset = ConstantLengthDataset(
    tokenizer,
    train_ds,
    infinite=True,
    seq_length=1024,
    chars_per_token=characters_per_token,
)
valid_dataset = ConstantLengthDataset(
    tokenizer,
    valid_ds,
    infinite=False,
    seq_length=1024,
    chars_per_token=characters_per_token,
)

### Define the training arguments

In [None]:
# Training configuration for a short 100-step run.
# Larger settings previously used in this cell: max_steps=5000 (or 1000) with
# eval_steps=500 and save_steps=500. When switching back to those, raise
# warmup_steps again as well (it was 100).
training_args = TrainingArguments(
    output_dir="santacoder-finetuned-the-stack-yaml",
    dataloader_drop_last=True,
    gradient_checkpointing=True,
    gradient_accumulation_steps=4,
    optim="adafactor",
    evaluation_strategy="steps",
    max_steps=100,
    eval_steps=50,
    save_steps=50,
    logging_steps=10,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    # Was 100, i.e. equal to max_steps: the whole run was then linear warmup,
    # the learning rate never reached `learning_rate`, and the cosine decay
    # never started. Warm up for the first 10% of the run instead.
    warmup_steps=10,
    weight_decay=0.05,
    fp16=False,
    push_to_hub=False,
)

### Train

In [None]:
# Set the training dataset's `start_iteration` attribute to 0.
# NOTE(review): presumably this is the offset ConstantLengthDataset starts
# from, so a non-zero value would be used when resuming -- confirm in dataset.py.
train_dataset.start_iteration = 0

In [None]:
# Bundle the model, the training arguments and both datasets into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # tokenizer=tokenizer  -- intentionally not passed.
    # NOTE(review): the reason is not recorded in this notebook.
)

In [None]:
# Run training as configured by `training_args`; the returned training
# summary is shown as the cell's output.
trainer.train()

In [None]:
# Persist the trained model. No path is given, so the Trainer's configured
# output directory is used.
# NOTE(review): the Trainer was built without a tokenizer, so tokenizer files
# are presumably not written here -- confirm before loading from this directory.
trainer.save_model() # this creates the config.json file etc you need to run generation tests