In [1]:
"""
Fine-Tune SantaCoder on code/text dataset
"""

import argparse
import os

import numpy as np
import torch
from datasets import load_dataset
from torch.utils.data import IterableDataset
from torch.utils.data.dataloader import DataLoader
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    logging,
    set_seed,
)

import fim


def chars_token_ratio(dataset, tokenizer, data_column, nb_examples=400):
    """
    Measure how many characters one token covers on average.

    Only the first ``nb_examples`` records of ``dataset`` are inspected (fewer if
    the dataset is shorter). For every inspected record the text stored under
    ``data_column`` is tokenized; the function returns total characters divided
    by total tokens over those records.
    """
    char_count = 0
    token_count = 0
    # range() goes first so zip stops after nb_examples without pulling an
    # extra record from the dataset iterator.
    sampled_rows = zip(range(nb_examples), iter(dataset))
    for _, row in tqdm(sampled_rows, total=nb_examples):
        char_count += len(row[data_column])
        encoding = tokenizer(row[data_column])
        token_count += len(encoding.tokens())

    return char_count / token_count


class ConstantLengthDataset(IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.

    Texts are read from ``dataset`` into a character buffer, tokenized as one batch,
    optionally FIM-permuted, joined with the concatenation (EOS) token and cut into
    chunks of exactly ``seq_length`` tokens. The incomplete chunk left at the end of
    each buffer is dropped.

        Args:
            tokenizer (Tokenizer): The processor used for processing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (float): Number of characters per token used to estimate number of tokens in text buffer.
            content_field (str): Key of the text field in each dataset record.
            fim_rate (float): Rate (0.0 to 1.0) that sample will be permuted with FIM.
            fim_spm_rate (float): Rate (0.0 to 1.0) of FIM permutations that will use SPM.
            seed (int): Seed for random number generator.
            eos_token_id (int, optional): Concatenation token id to use when the
                tokenizer does not define ``eos_token_id``.

        Raises:
            ValueError: If neither the tokenizer nor ``eos_token_id`` provides a
                concatenation token id.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=1024,
        num_of_sequences=1024,
        chars_per_token=3.6,
        content_field="content",
        fim_rate=0.5,
        fim_spm_rate=0.5,
        seed=0,
        eos_token_id=None,
    ):
        self.tokenizer = tokenizer
        # Compare against None explicitly: 0 is a legitimate token id, and the
        # fallback must not depend on a global `args` object that may not exist.
        if tokenizer.eos_token_id is not None:
            self.concat_token_id = tokenizer.eos_token_id
        elif eos_token_id is not None:
            self.concat_token_id = eos_token_id
        else:
            raise ValueError(
                "tokenizer has no eos_token_id; pass eos_token_id explicitly"
            )
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        # Buffer capacity in characters, estimated from the chars/token ratio.
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field
        self.fim_rate = fim_rate
        self.fim_spm_rate = fim_spm_rate
        self.seed = seed

        (
            self.suffix_tok_id,
            self.prefix_tok_id,
            self.middle_tok_id,
            self.pad_tok_id,
        ) = fim.get_fim_token_ids(self.tokenizer)
        # NOTE(review): truthiness test; presumably get_fim_token_ids returns None
        # ids when the tokenizer has no FIM tokens - confirm against fim.py.
        if not self.suffix_tok_id and self.fim_rate > 0:
            print("FIM is not supported by tokenizer, disabling FIM")
            self.fim_rate = 0

    def __iter__(self):
        """Yield ``{"input_ids", "labels"}`` dicts of ``seq_length`` token ids each."""
        iterator = iter(self.dataset)
        more_examples = True
        # Becomes True once the current pass over the dataset produced a record;
        # lets us detect a dataset that yields nothing instead of looping forever.
        seen_example = False
        while more_examples:
            buffer, buffer_len = [], 0
            while buffer_len < self.max_buffer_size:
                try:
                    text = next(iterator)[self.content_field]
                except StopIteration:
                    if self.infinite and seen_example:
                        # Start another pass over the data.
                        iterator = iter(self.dataset)
                        seen_example = False
                        continue
                    more_examples = False
                    break
                seen_example = True
                buffer.append(text)
                buffer_len += len(text)

            if not buffer:
                # Nothing left to tokenize (dataset exhausted exactly at a buffer
                # boundary, or empty dataset).
                break

            tokenized_inputs = self.tokenizer(buffer, truncation=False)["input_ids"]
            all_token_ids = []

            # NOTE(review): the RNG is re-created with the same seed for every
            # buffer, so each buffer replays the same sequence of random draws.
            np_rng = np.random.RandomState(seed=self.seed)
            for tokenized_input in tokenized_inputs:
                # optionally do FIM permutations
                if self.fim_rate > 0:
                    tokenized_input, np_rng = fim.permute(
                        tokenized_input,
                        np_rng,
                        self.suffix_tok_id,
                        self.prefix_tok_id,
                        self.middle_tok_id,
                        self.pad_tok_id,
                        fim_rate=self.fim_rate,
                        fim_spm_rate=self.fim_spm_rate,
                        truncate_or_pad=False,
                    )

                # Documents are separated by the concatenation token.
                all_token_ids.extend(tokenized_input + [self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i : i + self.seq_length]
                # Only full-length chunks are emitted; the remainder is discarded.
                if len(input_ids) == self.seq_length:
                    self.current_size += 1
                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(input_ids),
                    }

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pretrained SantaCoder checkpoint. Its model class ships as custom code with the
# checkpoint, hence trust_remote_code=True.
# NOTE(review): the loader output below recommends pinning `revision` when
# loading custom code - consider passing a fixed commit hash here.
model = AutoModelForCausalLM.from_pretrained("bigcode/santacoder", trust_remote_code=True)

# Tokenizer that belongs to the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bigcode/santacoder", use_auth_token=True)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


In [3]:
# Shell portion of the deduplicated Stack dataset (train split).
dataset = load_dataset(
    "bigcode/the-stack-dedup", data_dir="data/shell", split="train", use_auth_token=True
)

# Hold out 0.5% of the files for validation; the seed keeps the split reproducible.
dataset = dataset.train_test_split(test_size=0.005, seed=0)
train_data, valid_data = dataset["train"], dataset["test"]
print(
    f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}"
)

# The measured chars/token ratio sizes the character buffer of ConstantLengthDataset.
chars_per_token = chars_token_ratio(train_data, tokenizer, "content")
print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

# Both splits share every setting except `infinite`: the training stream restarts
# when exhausted, the validation stream ends. FIM is switched off (fim_rate=0).
train_dataset, valid_dataset = (
    ConstantLengthDataset(
        tokenizer,
        split,
        infinite=loop_forever,
        seq_length=2048,
        chars_per_token=chars_per_token,
        content_field="content",
        fim_rate=0,
        fim_spm_rate=0,
        seed=0,
    )
    for split, loop_forever in ((train_data, True), (valid_data, False))
)

Found cached dataset parquet (/home/saxenaya/.cache/huggingface/datasets/bigcode___parquet/bigcode--the-stack-dedup-93d9c24cd73ea2f7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached split indices for dataset at /home/saxenaya/.cache/huggingface/datasets/bigcode___parquet/bigcode--the-stack-dedup-93d9c24cd73ea2f7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-b19f929723caefde.arrow and /home/saxenaya/.cache/huggingface/datasets/bigcode___parquet/bigcode--the-stack-dedup-93d9c24cd73ea2f7/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec/cache-f7b6212aa31d7704.arrow


Size of the train set: 2225251. Size of the validation set: 11183


  0%|          | 0/400 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (3647 > 2048). Running this sequence through the model will result in indexing errors
100%|██████████| 400/400 [00:01<00:00, 352.04it/s]

The character to token ratio of the dataset is: 2.91





In [4]:
# Open a stream over the packed training chunks (built with infinite=True above).
train_iterator = iter(train_dataset)

In [5]:
# First packed chunk: a dict with "input_ids" and "labels" tensors.
ret = next(train_iterator)

In [6]:
# Unpack the two tensors of the first chunk for inspection.
input_ids, labels = ret["input_ids"], ret["labels"]

In [7]:
# Sanity check: decode the chunk's inputs back to text.
print(tokenizer.decode(input_ids))

#!/bin/bash

#####################################################################################
#                                  ADS-B RECEIVER                                   #
#####################################################################################
#                                                                                   #
# This script is not meant to be executed directly.                                 #
# Instead execute install.sh to begin the installation process.                     #
#                                                                                   #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                                                   #
# Copyright (c) 2015-2017, Joseph A. Prochazka                                      #
#                                                                                   #
# Permission is hereby granted, free of c

In [8]:
# Sanity check: the decoded labels should read the same as the decoded inputs.
print(tokenizer.decode(labels))

#!/bin/bash

#####################################################################################
#                                  ADS-B RECEIVER                                   #
#####################################################################################
#                                                                                   #
# This script is not meant to be executed directly.                                 #
# Instead execute install.sh to begin the installation process.                     #
#                                                                                   #
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
#                                                                                   #
# Copyright (c) 2015-2017, Joseph A. Prochazka                                      #
#                                                                                   #
# Permission is hereby granted, free of c

In [9]:
# ConstantLengthDataset builds labels from the same ids as input_ids; confirm it.
torch.all(labels == input_ids)

tensor(True)

In [10]:
# Second packed chunk from the same stream.
ret2 = next(train_iterator)

In [11]:
# Unpack the two tensors of the second chunk for inspection.
input_ids_2, labels_2 = ret2["input_ids"], ret2["labels"]

In [12]:
# Chunks are cut at fixed token offsets, so this text may start mid-file.
print(tokenizer.decode(input_ids_2))


    echo -e ""
    wget --no-check-certificate https://repo.feed.flightradar24.com/install_fr24_rpi.sh -O ${RECEIVER_BUILD_DIRECTORY}/flightradar24/install_fr24_rpi.sh
else
    # Otherwise assume i386.
    echo -e "\e[94m  Downloading the FlightRadar24 feeder client v${FLIGHTRADAR24_CLIENT_VERSION_I386} package for i386 devices...\e[97m"
    echo -e ""
    wget --no-check-certificate https://feed.flightradar24.com/linux/fr24feed_${FLIGHTRADAR24_CLIENT_VERSION_I386}_i386.deb -O ${RECEIVER_BUILD_DIRECTORY}/flightradar24/fr24feed_${FLIGHTRADAR24_CLIENT_VERSION_I386}_i386.deb
fi

## INSTALL THE COMPONENT PACKAGE

# Install the proper package depending on the devices architecture.
if [[ "${CPU_ARCHITECTURE}" = "armv7l" ]] || [[ "${CPU_ARCHITECTURE}" = "armv6l" ]] || [[ "${CPU_ARCHITECTURE}" = "aarch64" ]] ; then
    # ARM achitecture detected.
    echo -e "\e[94m  Executing the FlightRadar24 feeder client installation script...\e[97m"
    echo -e ""
    sudo bash ${RECEIVER_BUILD_DIRECTORY

In [13]:
# Sanity check: the decoded labels of the second chunk should match its inputs.
print(tokenizer.decode(labels_2))


    echo -e ""
    wget --no-check-certificate https://repo.feed.flightradar24.com/install_fr24_rpi.sh -O ${RECEIVER_BUILD_DIRECTORY}/flightradar24/install_fr24_rpi.sh
else
    # Otherwise assume i386.
    echo -e "\e[94m  Downloading the FlightRadar24 feeder client v${FLIGHTRADAR24_CLIENT_VERSION_I386} package for i386 devices...\e[97m"
    echo -e ""
    wget --no-check-certificate https://feed.flightradar24.com/linux/fr24feed_${FLIGHTRADAR24_CLIENT_VERSION_I386}_i386.deb -O ${RECEIVER_BUILD_DIRECTORY}/flightradar24/fr24feed_${FLIGHTRADAR24_CLIENT_VERSION_I386}_i386.deb
fi

## INSTALL THE COMPONENT PACKAGE

# Install the proper package depending on the devices architecture.
if [[ "${CPU_ARCHITECTURE}" = "armv7l" ]] || [[ "${CPU_ARCHITECTURE}" = "armv6l" ]] || [[ "${CPU_ARCHITECTURE}" = "aarch64" ]] ; then
    # ARM achitecture detected.
    echo -e "\e[94m  Executing the FlightRadar24 feeder client installation script...\e[97m"
    echo -e ""
    sudo bash ${RECEIVER_BUILD_DIRECTORY

In [22]:
from torch.utils.data import Dataset

class CodeDataset(Dataset):
    """
    Map-style dataset of tokenized "DSL header + task prompt + program" strings.

    ``data_dir`` must contain one sub-folder per program. Each sub-folder holds
    ``prompts.txt`` (one natural-language prompt per line) and ``code.txt`` (the
    program all of those prompts map to). Every prompt line is one sample.

    Args:
        data_dir: Directory containing the example sub-folders.
        tokenizer: Callable tokenizer; called with ``padding="max_length"`` and
            ``max_length`` and expected to return a mapping with ``"input_ids"``.
        prompt: Text prepended to every sample (stored as ``self.DSL``).
        max_length: Length every sample is padded to.

    NOTE(review): samples longer than ``max_length`` are not truncated (no
    ``truncation`` argument is passed), and labels contain the padding ids
    unmasked - confirm this is what the training loop expects.
    """

    def __init__(self, data_dir, tokenizer, prompt, max_length=256):
        self.data_dir = data_dir
        # os.listdir returns entries in arbitrary order; sort so that the mapping
        # from index to sample is reproducible across runs and machines.
        self.example_folders = sorted(os.listdir(self.data_dir))

        # Unused by this class; kept so existing attribute access keeps working.
        self.data = []

        self.DSL = prompt
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Number of prompts in each folder
        prompt_lengths = [
            len(self._read_prompts(folder)) for folder in self.example_folders
        ]
        self.total_length = sum(prompt_lengths)

        # prompt_indexes[k] is the global index of the first prompt of folder k.
        self.prompt_indexes = [0]
        for length in prompt_lengths[:-1]:
            self.prompt_indexes.append(self.prompt_indexes[-1] + length)

    def _read_prompts(self, folder):
        """Return the prompt lines of ``<data_dir>/<folder>/prompts.txt``."""
        with open(os.path.join(self.data_dir, folder, "prompts.txt"), 'r') as f:
            lines = f.read().split("\n")
        # A file ending in a newline would otherwise add a bogus empty prompt.
        if lines[-1] == "":
            lines.pop()
        return lines

    def __len__(self):
        return self.total_length

    def __getitem__(self, idx):
        """
        Build and tokenize sample ``idx``.

        Returns:
            dict with ``"input_ids"`` and ``"labels"`` holding equal id lists.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        if idx < 0:
            idx += self.total_length
        if not 0 <= idx < self.total_length:
            raise IndexError("CodeDataset index out of range")

        # The owning folder is the last one whose first global index is <= idx.
        # The loop variables must not reuse the name `idx`.
        folder_idx = 0
        for candidate, first_index in enumerate(self.prompt_indexes):
            if idx >= first_index:
                folder_idx = candidate

        folder = self.example_folders[folder_idx]
        prompts = self._read_prompts(folder)
        prompt_idx = idx - self.prompt_indexes[folder_idx]

        with open(os.path.join(self.data_dir, folder, "code.txt"), 'r') as f:
            code = f.read()

        # Indent every line after the first by one tab; the first line receives
        # its tab from the separator added below.
        code = "\n\t".join(code.split("\n"))

        final_prompt = self.DSL + prompts[prompt_idx] + "\n\t" + code
        input_ids = self.tokenizer(
            final_prompt, padding="max_length", max_length=self.max_length
        )["input_ids"]
        # Return a separate list for labels so mutating one does not alter the other.
        return {"input_ids": input_ids, "labels": list(input_ids)}

# Header text that CodeDataset prepends to every training sample.
with open("DSL.txt", 'r') as f:
    prompt = f.read()

# CodeDataset pads each sample to a fixed length, so the tokenizer needs a pad
# token; padding is added on the left.
# NOTE(review): this mutates the `tokenizer` object loaded in an earlier cell, and
# the decoded sample further down shows the padding as <|endoftext|> - confirm
# that "<pad>" is really the intended pad token for this vocabulary.
tokenizer.pad_token = "<pad>"
tokenizer.padding_side = "left"

code_dataset = CodeDataset("./train", tokenizer, prompt)

In [23]:
# Fetch sample 0 once: every indexing call on CodeDataset reads prompts.txt and
# code.txt from disk and runs the tokenizer again.
code_sample = code_dataset[0]
code_input_ids, code_labels = code_sample["input_ids"], code_sample["labels"]

In [24]:
# Sanity check: decode the padded sample (left padding shows up as leading pad tokens).
print(tokenizer.decode(code_input_ids))

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'''
def go_to(location : str)
def find(object : str)
def pick_up(object : str)
def put_down(object : str)
def find(object : str)
def ask(person : str, question : str, options: Optional[List[str]])
def say(message : str)
'''
def main():
    # Using the functions defined above, write a script to do the following: Head to the living room and get the TV remote. Then, go to the kitchen to retrieve an apple. Finally, give Joydeep the TV remote and Yash the apple since they are both working in their respective offices.
	go_to("Living room")
	find("TV Remote")
	pick_up("TV Remote")
	go_to("Kitchen")
	find("apple")
	pick_up("apple")
	go_to("Joydeep's office")
	put_down("TV Remote")
	go_to("Yash's offi

In [25]:
# Sanity check: the labels hold the same ids as the inputs, so the decoded text should match.
print(tokenizer.decode(code_labels))

<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>'''
def go_to(location : str)
def find(object : str)
def pick_up(object : str)
def put_down(object : str)
def find(object : str)
def ask(person : str, question : str, options: Optional[List[str]])
def say(message : str)
'''
def main():
    # Using the functions defined above, write a script to do the following: Head to the living room and get the TV remote. Then, go to the kitchen to retrieve an apple. Finally, give Joydeep the TV remote and Yash the apple since they are both working in their respective offices.
	go_to("Living room")
	find("TV Remote")
	pick_up("TV Remote")
	go_to("Kitchen")
	find("apple")
	pick_up("apple")
	go_to("Joydeep's office")
	put_down("TV Remote")
	go_to("Yash's offi

: 