# Fine-tune Santacoder on BASH/SHELL (The Stack)

You will need at least a 16GB GPU!

## Install the required libraries

In [1]:
!pip install transformers git+https://github.com/huggingface/transformers.git
!pip install accelerate git+https://github.com/huggingface/accelerate.git

Collecting git+https://github.com/huggingface/transformers.git
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-req-build-3t9s2j7_
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-3t9s2j7_
  Resolved https://github.com/huggingface/transformers.git to commit 0fd8d2aa2cc9e172a8af9af8508b2530f55ca14c
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Collecting git+https://github.com/huggingface/accelerate.git
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-req-build-4aq5p5lb
  Running command git clone

## Log in to the HF Hub

In [2]:
# Authenticate this session against the Hugging Face Hub.
# notebook_login() renders an interactive widget as the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!pip install datasets

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.2.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


## Import the required libraries

In [4]:
import torch
from dataclasses import dataclass
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from tqdm.notebook import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed
)

## Set the model and dataset ids

In [22]:
# Alternative checkpoint, currently disabled:
#model_id = "bigcode/tiny_starcoder_py"
model_id = "bigcode/santacoder"  # checkpoint that is fine-tuned below
dataset_id = "thanhnew2001/taipycode"  # Hub dataset passed to load_dataset
data_dir = ""  # forwarded as load_dataset(data_dir=...); empty string here

## Load the model and the tokenizer

In [23]:
# Load the tokenizer published with the checkpoint named by model_id.
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [24]:
# Load the causal language model for model_id.
# trust_remote_code=True lets model code shipped in the Hub repository run
# locally, so only use it with repositories you trust.
# use_cache=False is forwarded to from_pretrained — presumably to keep the
# generation cache off while gradient checkpointing is used for training; confirm.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, use_cache=False)

## Load the dataset
For the sake of simplicity you can restrict training to a subset of the data (e.g. `train[:10%]`); the cell below currently loads the full training split (`train[:100%]`).

In [25]:
# Download the training split from the Hub. The slice 'train[:100%]' keeps
# every row; lower the percentage to work on a subset.
dataset = load_dataset(dataset_id, data_dir=data_dir, split='train[:100%]')

Downloading readme:   0%|          | 0.00/101 [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

## Let's calculate the characters per token

In [26]:
# Estimate the average number of characters per token on the first rows of
# the dataset.
examples = 500
total_characters = 0
total_tokens = 0

# zip() against range(examples) stops the scan after `examples` rows.
sample = zip(range(examples), dataset)
for _, row in tqdm(sample, total=examples):
    text = row['content']
    total_characters += len(text)
    total_tokens += len(tokenizer(text).tokens())

characters_per_token = total_characters / total_tokens
print(characters_per_token)

  0%|          | 0/500 [00:00<?, ?it/s]

2.7291867263576894


## Create a test split to eval our model

In [27]:
# Hold out 0.5% of the rows as an evaluation split; shuffle=False disables
# shuffling before the split.
# NOTE(review): `dataset` is rebound to the split result here, so this cell is
# not safe to re-run without reloading the dataset first — consider a new name.
dataset = dataset.train_test_split(test_size=0.005, shuffle=False, seed=555)

In [28]:
# Display the splits and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 3629
    })
    test: Dataset({
        features: ['content'],
        num_rows: 19
    })
})

In [29]:
# Print the raw text of the first training row to see what the data looks like.
print(dataset['train'][0]['content'])

<|Example Instructions|expandable|expanded=True|
<|What are the 5 most profitable cities?|button|on_action=example1|>

<|Plot sales by product line in a pie chart|button|on_action=example3|>

<|Plot in a bar chart sales of the 5 most profitable cities, sorted descending, with ylabel 'Sales ($)'|button|on_action=example2|>
|>

<|{user_input}|input|on_action=modify_data|class_name=fullwidth|change_delay=1000|label=Enter your instruction here|>

<center>
<|{content}|image|width=50%|>
</center>

<|Dataset|expandable|expanded=True|
<|{data}|table|width=100%|page_size=5|rebuild|>
|>

<|Reset Data|button|on_action=reset_data|>


In [30]:
# Short handles for the two splits used by the rest of the notebook.
train_ds, valid_ds = dataset["train"], dataset["test"]

In [31]:
# Row counts of the training and evaluation splits.
len(train_ds), len(valid_ds)

(3629, 19)

## Define the dataset and create the dataloaders

In [32]:
class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from a stream of text files.

    Texts are read from ``dataset`` into a buffer until the buffer holds at least
    ``seq_length * chars_per_token * num_of_sequences`` characters. The buffer is
    tokenized in one batch, ``concat_token_id`` is appended after every text, and
    the resulting token stream is cut into consecutive windows of ``seq_length``
    tokens. A trailing window shorter than ``seq_length`` is dropped at the end
    of every buffer.

    Args:
        tokenizer (Tokenizer): The processor used for processing the data. It is
            called with a list of texts and must return the ids under ``"input_ids"``.
        dataset (dataset.Dataset): Dataset with text files; the text is read from
            the ``"content"`` field of every example.
        infinite (bool): If True the iterator is reset after dataset reaches end else stops.
        seq_length (int): Length of token sequences to return.
        num_of_sequences (int): Number of token sequences to keep in buffer.
        chars_per_token (float): Number of characters per token used to estimate
            number of tokens in text buffer.

    Yields:
        dict: ``{"input_ids": LongTensor, "labels": LongTensor}``; both tensors hold
        the same ``seq_length`` token ids.

    Raises:
        ValueError: If ``infinite`` is True and a full pass over ``dataset`` yields
            no example (restarting would otherwise loop forever).
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=2.95,
    ):
        self.tokenizer = tokenizer
        # Compare against None rather than truthiness: an eos id of 0 is a valid
        # token id and must not be replaced by the fallback.
        eos_token_id = tokenizer.eos_token_id
        self.concat_token_id = eos_token_id if eos_token_id is not None else 49152
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        # Counts the sequences yielded so far across all iterations.
        self.current_size = 0
        # Buffer capacity, measured in characters of raw text.
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = "content"

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        # Tracks whether the current pass over the dataset produced anything, so
        # an empty dataset in infinite mode is detected instead of spinning.
        seen_since_restart = False
        while more_examples:
            buffer, buffer_len = [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    buffer.append(next(iterator)[self.content_field])
                    buffer_len += len(buffer[-1])
                    seen_since_restart = True
                except StopIteration:
                    if self.infinite:
                        if not seen_since_restart:
                            raise ValueError(
                                "ConstantLengthDataset: dataset yielded no examples; "
                                "cannot iterate infinitely over an empty dataset"
                            ) from None
                        iterator = iter(self.dataset)
                        seen_since_restart = False
                    else:
                        more_examples = False
                        break
            if not buffer:
                # Nothing was collected (source exhausted): skip the tokenizer call.
                break
            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []
            for tokenized_input in tokenized_inputs:
                all_token_ids.extend(tokenized_input + [self.concat_token_id])
            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length windows are emitted; the short tail is dropped.
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

In [33]:
# Shuffle the training rows with a fixed seed.
train_ds = train_ds.shuffle(seed=555)

# Size the text buffer with the characters-per-token ratio measured on this
# dataset earlier in the notebook, instead of silently using the class default.
train_dataset = ConstantLengthDataset(
        tokenizer, train_ds, infinite=True, seq_length=1024,
        chars_per_token=characters_per_token,
    )
valid_dataset = ConstantLengthDataset(
        tokenizer, valid_ds, infinite=False, seq_length=1024,
        chars_per_token=characters_per_token,
    )

In [34]:
# Pull a single packed example to sanity-check the dataset output.
next(iter(train_dataset))

{'input_ids': tensor([  27,   91, 2131,  ...,   91, 2131,  207]),
 'labels': tensor([  27,   91, 2131,  ...,   91, 2131,  207])}

## Define the training arguments

In [35]:
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub.
    output_dir="santacoder-finetuned-the-stack-bash",
    push_to_hub=False,
    # Schedule: a 10-step run that evaluates, saves and logs at step 10.
    max_steps=10,
    evaluation_strategy="steps",
    eval_steps=10,
    save_steps=10,
    logging_steps=10,
    # Batching and memory settings.
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    dataloader_drop_last=True,
    fp16=False,
    # Optimizer and learning-rate schedule.
    optim="adafactor",
    learning_rate=5e-5,
    lr_scheduler_type="cosine",
    warmup_steps=1,
    weight_decay=0.05,
)

## Create the Trainer object and start training

In [36]:
# NOTE(review): ConstantLengthDataset as defined in this notebook never reads
# `start_iteration`; this assignment looks like a leftover — confirm before
# removing it.
train_dataset.start_iteration = 0

In [37]:
# Bundle the model, the training arguments and both packed datasets.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    #tokenizer=tokenizer,
)

In [38]:
# Run the fine-tuning loop; the returned TrainOutput is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss
10,1.0111,0.079257


TrainOutput(global_step=10, training_loss=1.0111072540283204, metrics={'train_runtime': 34.6483, 'train_samples_per_second': 1.154, 'train_steps_per_second': 0.289, 'total_flos': 250617884835840.0, 'train_loss': 1.0111072540283204, 'epoch': 1.0})

In [39]:
# Write the fine-tuned model and the tokenizer files to the local 'finetuned'
# directory, from which they are reloaded further down.
model.save_pretrained('finetuned')
tokenizer.save_pretrained('finetuned')

('finetuned/tokenizer_config.json',
 'finetuned/special_tokens_map.json',
 'finetuned/vocab.json',
 'finetuned/merges.txt',
 'finetuned/added_tokens.json',
 'finetuned/tokenizer.json')

In [40]:
# Path of the saved fine-tuned checkpoint, used for the inference cells below.
finetuned_model = 'finetuned'

In [45]:
# Reload tokenizer and model from the directory named by finetuned_model.
# device_map={"":0} — presumably places the whole model on GPU 0 (the prompts
# below are moved to "cuda:0"); confirm against the accelerate documentation.
# NOTE(review): use_cache=False mirrors the training-time load; for generation
# it likely only costs speed — verify whether it can be dropped.
tokenizer2 = AutoTokenizer.from_pretrained(finetuned_model)
model2 = AutoModelForCausalLM.from_pretrained(finetuned_model, trust_remote_code=True, use_cache=False, device_map={"":0})

In [46]:
# Run inference with the reloaded fine-tuned model.
# Generation settings: temperature / top_k / top_p only take effect when
# sampling is enabled, so do_sample is switched on explicitly.
params = {
    'max_new_tokens': 128,
    'do_sample': True,
    'temperature': 0.2,
    'top_k': 50,
    'top_p': 0.1,
    'repetition_penalty': 1.17
}

# Fix the RNG state so the sampled completion is repeatable.
set_seed(555)

# Prompt: the first line of the training example printed earlier in the notebook.
inputs = tokenizer2.encode("<|Example Instructions|expandable|expanded=True|", return_tensors="pt").to("cuda:0")
# Use the tokenizer that was reloaded together with model2 for padding and decoding.
outputs = model2.generate(inputs, pad_token_id=tokenizer2.eos_token_id, **params)
print()
print(tokenizer2.decode(outputs[0]))
print()


<|Example Instructions|expandable|expanded=True|
<|{user_input}|input|on_action=example2|><|endoftext|>



In [48]:
# Prompt: a fragment in the same markup style as the training data.
inputs = tokenizer2.encode("Instruction for row 4 chart|button|on_action=example11", return_tensors="pt").to("cuda:0")
# Use the tokenizer that was reloaded together with model2 for padding and decoding.
outputs = model2.generate(inputs, pad_token_id=tokenizer2.eos_token_id, **params)
print()
print(tokenizer2.decode(outputs[0]))
print()


Instruction for row 4 chart|button|on_action=example11|>

<|{user_input}|input|on_action=modify_data|class_name=fullwidth|change_delay=1000|label=Enter your instruction for row 5 here|>

<|Row 5 Instructions|expandable|expanded=True|
<|{content}|image|width=50%|>
</|Row 5 Instructions|expandable|expanded=True|
<|{content}|image|width=50%|>
</|Row 5 Instructions|expandable|expanded=True|
<|{content}|image|



In [47]:
# Prompt: a natural-language instruction, to see how the model responds to it.
inputs = tokenizer2.encode("write javascript code to display a calendar", return_tensors="pt").to("cuda:0")
# Use the tokenizer that was reloaded together with model2 for padding and decoding.
outputs = model2.generate(inputs, pad_token_id=tokenizer2.eos_token_id, **params)
print()
print(tokenizer2.decode(outputs[0]))
print()


write javascript code to display a calendar for the given date
 * @param {string} date - Date in format YYYY-MM-DD
 */
function showCalendar(date) {
    // get the current year and month from the input field
    var year = document.getElementById("year").value;
    var month = document.getElementById("month").value;

    // create a new date object with the given date string
    var d = new Date();
    d.setFullYear(parseInt(year));
    d.setMonth(parseInt(month)-1);
    d.setDate(1);
    
    // set the initial day of week to Monday,

