# Fine-tune SantaCoder on YAML
Sources:
* https://github.com/loubnabnl/santacoder-finetuning
* https://colab.research.google.com/drive/1UMjeXHwOldpLnWjdm1499o2IYy0RgeTw?usp=sharing



### Install required libraries

In [None]:
! pip install transformers datasets torch

### Log in to Hugging Face to access The Stack

In [None]:
import os
from huggingface_hub import login
# Authenticate against the Hugging Face Hub with the token stored in the
# HUGGINGFACE_TOKEN environment variable (None is passed when it is unset).
# NOTE: notebook_login() would be preferred, but it was not prompting properly.
login(token=os.getenv("HUGGINGFACE_TOKEN"))

### Import required libraries

In [None]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)
from dataset import ConstantLengthDataset

### Specify the original pre-trained model and fine-tuned model

In [None]:
# Identifier of the base SantaCoder checkpoint; the tokenizer is loaded from it.
pretrained_id = "bigcode/santacoder"
# Identifier of the fine-tuned checkpoint; the model-loading cell reads its
# weights from this name.
model_id = "santacoder-finetuned-the-stack-yaml"

### Load the tokenizer

In [None]:
# Load the tokenizer of the base checkpoint named by pretrained_id. It is used
# later to measure characters per token and to build the training datasets.
tokenizer = AutoTokenizer.from_pretrained(pretrained_id)

### Load the model

In [None]:
# Load the causal language model from model_id (not pretrained_id).
# NOTE(review): the tokenizer comes from pretrained_id while the weights come
# from model_id -- confirm that starting from the fine-tuned checkpoint, rather
# than the base model, is intended.
# trust_remote_code=True: presumably required because the checkpoint ships its
# own modelling code; only use it with checkpoints you trust.
# use_cache=False: presumably set because gradient checkpointing is enabled in
# the training arguments -- TODO confirm.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

### Create a minimal Ansible dataset from a text file

In [None]:
# Build the dataset from the local file ansible-examples-ec2.txt using the
# "text" loader. sample_by="paragraph" presumably turns each blank-line-separated
# paragraph into one example -- confirm against the datasets documentation.
dataset_dict = load_dataset("text", data_files="ansible-examples-ec2.txt", sample_by="paragraph")
# Only the 'train' entry of the returned dictionary is used from here on.
dataset = dataset_dict['train']

### Calculate characters per token

In [None]:
# Estimate the average number of characters per token on a sample of the
# dataset. The resulting ratio is handed to ConstantLengthDataset as
# chars_per_token when the training and validation datasets are built.
examples = 500  # upper bound on the number of examples to sample
# The dataset may hold fewer examples than the cap; size the loop and the
# progress bar by what is actually available.
sample_size = min(examples, len(dataset))
total_characters, total_tokens = 0, 0

for _, example in tqdm(zip(range(sample_size), dataset), total=sample_size):
    text = example['text']
    total_characters += len(text)
    total_tokens += len(tokenizer(text).tokens())

# Fail with a clear message instead of an opaque ZeroDivisionError when the
# sample yields no tokens (for example, an empty dataset).
if total_tokens == 0:
    raise ValueError(
        "Cannot compute characters per token: the sampled examples produced no tokens."
    )

characters_per_token = total_characters / total_tokens
print(characters_per_token)

### Create a train/validation split

In [None]:
# Hold out 10% of the examples for validation. The seed is fixed (same value as
# the one used to shuffle the training split) so the split is identical across
# runs and evaluation results stay comparable.
# Note that `dataset` is rebound here from the single dataset to the dictionary
# of splits returned by train_test_split.
dataset = dataset.train_test_split(test_size=0.1, seed=555)
train_ds = dataset["train"]
valid_ds = dataset["test"]  # the "test" split serves as the validation set
# Display the split sizes as the cell output.
len(train_ds), len(valid_ds)

In [None]:
# Reshuffle the training split with a fixed seed so the example order is
# repeatable between runs.
train_ds = train_ds.shuffle(seed=555)

# Wrap both splits in ConstantLengthDataset, reading the "text" field with a
# sequence length of 1024 and the measured characters-per-token ratio.
# The two wrappers differ only in the split and in `infinite`: True for
# training, False for validation (presumably the training stream restarts when
# exhausted -- confirm in dataset.py).
train_dataset = ConstantLengthDataset(
    tokenizer,
    train_ds,
    content_field="text",
    chars_per_token=characters_per_token,
    seq_length=1024,
    infinite=True,
)
valid_dataset = ConstantLengthDataset(
    tokenizer,
    valid_ds,
    content_field="text",
    chars_per_token=characters_per_token,
    seq_length=1024,
    infinite=False,
)

### Define the training arguments

In [None]:
# Hyperparameters for the fine-tuning run, grouped by purpose.
training_args = TrainingArguments(
    # Output location; nothing is pushed to the Hub.
    output_dir="santacoder-finetuned-the-stack-ansible-ec2",
    push_to_hub=False,
    # Run length and how often to log, evaluate and save (all counted in steps).
    max_steps=1000,  # set to 5000 for a longer run
    logging_steps=10,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` -- confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    # Batching: one sequence per device, gradients accumulated over 4 steps.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    dataloader_drop_last=True,
    # Optimizer and learning-rate schedule.
    optim="adafactor",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=100,
    weight_decay=0.05,
    # Memory / precision settings.
    gradient_checkpointing=True,
    fp16=False,
)

### Train

In [None]:
# Reset the training dataset's start_iteration attribute to 0 before training.
# NOTE(review): the attribute's meaning is defined by ConstantLengthDataset in
# dataset.py, which is not shown here -- confirm what it controls.
train_dataset.start_iteration = 0

In [None]:
# Assemble the Trainer from the model, the training arguments and the two
# constant-length datasets. The tokenizer argument is left commented out.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    # tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the settings in training_args (max_steps=1000, with
# evaluation and checkpointing every 500 steps).
trainer.train()

In [None]:
# Write the trained model to disk (presumably into training_args.output_dir,
# since no path is given -- confirm).
trainer.save_model() # this creates the config.json file etc you need to run generation tests