# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing


### Install required libraries

In [7]:
! pip install transformers datasets ipywidgets "torch>2.2" accelerate flash_attn 

Collecting torch>2.2
  Downloading torch-2.3.1-cp39-cp39-manylinux1_x86_64.whl.metadata (26 kB)
Collecting flash_attn
  Downloading flash_attn-2.5.9.post1.tar.gz (2.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.6/2.6 MB 25.8 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting nvidia-nccl-cu12==2.20.5 (from torch>2.2)
  Downloading nvidia_nccl_cu12-2.20.5-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting triton==2.3.1 (from torch>2.2)
  Downloading triton-2.3.1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.4 kB)
Collecting einops (from flash_attn)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading torch-2.3.1-cp39-cp39-manylinux1_x86_64.whl (779.1 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 779.1/779.1 MB 43.8 MB/s eta 0:00:00
Downloading nvidia_nccl_cu12-2.20.5-py3-non

### Log in to HuggingFace to access The Stack

In [8]:
import os
from huggingface_hub import login

# Note: Prefer notebook_login() but this was not prompting properly, so the
# access token is read from the HUGGINGFACE_TOKEN environment variable.
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
if not hf_token:
    # Without a token, login() would fall back to the interactive prompt that
    # does not work in this environment; fail early with an actionable message.
    raise RuntimeError(
        "HUGGINGFACE_TOKEN is not set; export a Hugging Face access token "
        "before starting the notebook kernel."
    )
login(token=hf_token)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to /opt/app-root/src/.cache/huggingface/token
Login successful


### Import required libraries

In [9]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from dataset import ConstantLengthDataset

### Specify the pre-trained model and dataset

In [10]:
# Identifiers shared by the loading cells below.
dataset_id = "bigcode/the-stack-dedup"  # dataset repository passed to load_dataset
data_dir = "data/yaml"                  # forwarded as `data_dir` when loading the dataset
model_id = "bigcode/santacoder"         # checkpoint used for both tokenizer and model

### Load the tokenizer

In [11]:
# Load the tokenizer that belongs to the checkpoint named by `model_id`; it is
# reused below to estimate characters-per-token and to build the packed datasets.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Load the model

In [None]:
# Options forwarded to from_pretrained when loading the checkpoint:
#   trust_remote_code    - allow the modelling code shipped in the Hub repo to run
#   attn_implementation  - request the FlashAttention-2 attention backend
#   use_cache            - turn off the generation cache while loading for training
# NOTE(review): FlashAttention-2 kernels operate on fp16/bf16 tensors - confirm
# that training runs with mixed precision or a half-precision torch_dtype.
model_load_options = dict(
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    use_cache=False,
)
model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

### Load the dataset
Use the first 10% of the dataset

In [None]:
# Load only the leading 10% slice of the `train` split from the configured
# dataset repository and data directory.
dataset = load_dataset(
    dataset_id,
    data_dir=data_dir,
    split='train[:10%]',
)

### Calculate characters per token

In [None]:
# Estimate the average number of characters per token from the first
# `examples` records; the ratio is later handed to ConstantLengthDataset.
examples = 500
total_characters = 0
total_tokens = 0

# Pairing with range(examples) stops the iteration after `examples` records.
sample = zip(range(examples), iter(dataset))
for _, example in tqdm(sample, total=examples):
    content = example['content']
    total_characters += len(content)
    total_tokens += len(tokenizer(content).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)

### Create a held-out validation split

In [None]:
# Hold out 0.5% of the rows (no shuffling) as a validation set.
# The result is bound to a new name instead of overwriting `dataset`, so this
# cell - and the earlier cells that iterate over `dataset` - can be re-run
# without first reloading the data.
dataset_splits = dataset.train_test_split(test_size=0.005, shuffle=False, seed=555)
train_ds = dataset_splits["train"]
valid_ds = dataset_splits["test"]
# Display the sizes of both splits as the cell output.
len(train_ds), len(valid_ds)

In [None]:
# Shuffle the training rows with a fixed seed before wrapping them.
train_ds = train_ds.shuffle(seed=555)

# Settings shared by the training and validation wrappers.
packing_options = dict(seq_length=1024, chars_per_token=characters_per_token)

# The training wrapper is created with infinite=True, the validation wrapper
# with infinite=False.
train_dataset = ConstantLengthDataset(
    tokenizer, train_ds, infinite=True, **packing_options
)
valid_dataset = ConstantLengthDataset(
    tokenizer, valid_ds, infinite=False, **packing_options
)

### Define the training arguments

In [None]:
# Hyper-parameters for the fine-tuning run.
training_args = TrainingArguments(
        output_dir="santacoder-finetuned-the-stack-yaml",
        dataloader_drop_last=True,
        gradient_checkpointing=True,
        gradient_accumulation_steps=4,
        optim="adafactor",
        evaluation_strategy="steps",
        # Shortened run; a longer schedule of max_steps=5000 was used previously.
        max_steps=1000,
        eval_steps=500,
        save_steps=500,
        logging_steps=10,
        per_device_train_batch_size=1,
        per_device_eval_batch_size=1,
        learning_rate=5e-5,
        lr_scheduler_type="cosine",
        warmup_steps=100,
        weight_decay=0.05,
        fp16=False,
        # The model is loaded with attn_implementation="flash_attention_2",
        # whose kernels only accept fp16/bf16 inputs. Train under bf16 mixed
        # precision so the attention layers receive half-precision tensors
        # instead of failing on fp32 activations.
        bf16=True,
        push_to_hub=False
)

### Train

In [None]:
# Reset the iteration counter on the training dataset before training starts.
# NOTE(review): `start_iteration` is defined by the project-local
# ConstantLengthDataset - confirm how it is consumed there.
train_dataset.start_iteration = 0

In [None]:
# Assemble the Trainer from the model, the training arguments and the packed
# training / validation datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # tokenizer=tokenizer,  # currently disabled
)

In [None]:
# Run the fine-tuning loop; the value returned by train() is displayed as the
# cell output.
trainer.train()

In [None]:
# Save the fine-tuned model. This also creates config.json and the other
# files needed to run generation tests against the saved model.
trainer.save_model()