In [1]:
!export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,7

In [2]:
%%capture --no-display --no-stderr --no-stdout
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms
from torch.optim import Adam
from torch.utils.data import Dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
import os

import pickle
from tqdm import tqdm
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    DataCollatorForLanguageModeling,
)
from generate_with_embeddings import GenerateWithEmbeddings
from utils import ConstantLengthDataset
import matplotlib.pyplot as plt
%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
with open("DSL.txt", 'r') as f:
    prompt = f.read()

In [4]:
device = "cuda"

class CodeDataset(Dataset): # NOTE: this isn't very space-efficient since it has to load in the entire dataset at once
    def __init__(self, data_dir):
        self.data_dir = data_dir

        self.data = []
        for d in os.listdir(self.data_dir):
            code_dir = os.path.join(self.data_dir, d, "code.txt")
            prompts_dir = os.path.join(self.data_dir, d, "prompts.txt")

            with open(code_dir, 'r') as f:
                code = f.read()
            
            with open(prompts_dir, 'r') as f:
                prompts = f.read()
            
            prompts = prompts.split("\n")

            for p in prompts:
                self.data.append((code, prompt + p + "\n\t"))

    def __len__(self):
        return len(os.listdir(self.data_dir))
    def __getitem__(self, idx):
        input_ids = tokenizer(self.data[idx][1], padding="max_length", max_length=4)["input_ids"]
        labels = tokenizer(self.data[idx][0], padding="max_length", max_length=4)["input_ids"]
        return {"input_ids": input_ids, "labels": labels}

In [5]:
checkpoint = "bigcode/santacoder" # "facebook/incoder-6B" # 
revision = "dedup-alt"

tokenizer = AutoTokenizer.from_pretrained(checkpoint, revision=revision)
model = AutoModelForCausalLM.from_pretrained(checkpoint, revision=revision, trust_remote_code=True).to(device)

tokenizer.pad_token = "<pad>"
tokenizer.padding_side = "left"

In [6]:
import torch


def _stop_at_stop_token(decoded_string, stop_tokens):
    """
    Produces the prefix of decoded_string that ends at the first occurrence of
    a stop_token.

    WARNING: the decoded_string *must not* include the prompt, which may have stop tokens
    itself.
    """
    min_stop_index = len(decoded_string)
    for stop_token in stop_tokens:
        stop_index = decoded_string.find(stop_token)
        if stop_index != -1 and stop_index < min_stop_index:
            min_stop_index = stop_index
    return decoded_string[:min_stop_index]


class ConstantLengthDataset(torch.utils.data.IterableDataset):
    """
    Iterable dataset that returns constant length chunks of tokens from stream of text files.
        Args:
            tokenizer (Tokenizer): The processor used for proccessing the data.
            dataset (dataset.Dataset): Dataset with text files.
            infinite (bool): If True the iterator is reset after dataset reaches end else stops.
            seq_length (int): Length of token sequences to return.
            num_of_sequences (int): Number of token sequences to keep in buffer.
            chars_per_token (int): Number of characters per token used to estimate number of tokens in text buffer.
    """

    def __init__(
        self,
        tokenizer,
        dataset,
        infinite=False,
        seq_length=4,
        num_of_sequences=4,
        chars_per_token=3.6,
        content_field="content",
    ):
        self.tokenizer = tokenizer
        self.concat_token_id = (
            tokenizer.eos_token_id if tokenizer.eos_token_id else tokenizer.encode(
                "<|endoftext|>")
        )
        self.dataset = dataset
        self.seq_length = seq_length
        self.infinite = infinite
        self.current_size = 0
        self.max_buffer_size = seq_length * chars_per_token * num_of_sequences
        self.content_field = content_field

    def __iter__(self):
        iterator = iter(self.dataset)
        more_examples = True
        while more_examples:
            input_ids_buffer, labels_buffer, buffer_len = [], [], 0
            while True:
                if buffer_len >= self.max_buffer_size:
                    break
                try:
                    input_ids_buffer.append(next(iterator)["input_ids"])
                    labels_buffer.append(next(iterator)["labels"])
                    buffer_len += len(input_ids_buffer[-1])
                except StopIteration:
                    if self.infinite:
                        iterator = iter(self.dataset)
                    else:
                        more_examples = False
                        break

            all_token_ids, all_labels = [], []

            for token_ids, labels in zip(input_ids_buffer, labels_buffer):
                all_token_ids.extend(token_ids + [self.concat_token_id])
                all_labels.extend(labels + [self.concat_token_id])

            for i in range(0, len(all_token_ids), self.seq_length):
                input_ids = all_token_ids[i: i + self.seq_length]
                labels = all_labels[i: i + self.seq_length]

                if len(input_ids) == self.seq_length:
                    self.current_size += 1

                    yield {
                        "input_ids": torch.LongTensor(input_ids),
                        "labels": torch.LongTensor(labels)
                    }

In [7]:
# from utils import ConstantLengthDataset

batch_size = 16
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False, return_tensors="pt")

train_code_dataset = CodeDataset("./train")
train_dataset = ConstantLengthDataset(tokenizer, train_code_dataset)
train_dataloader = torch.utils.data.DataLoader(train_dataset, 
                                         batch_size,
                                         collate_fn=data_collator)

In [8]:
dataset_iterator = iter(train_dataset)
res = next(dataset_iterator)

input_ids, labels = res['input_ids'], res['labels']

In [9]:
print(tokenizer.decode(input_ids))

'''
def go


In [10]:
print(tokenizer.decode(labels))

go_to("


In [9]:
model.train()
learning_rate = 2e-5

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)

In [10]:
num_epochs = 100
train_size_per_epoch = 100
step = 1

for epoch in tqdm(range(num_epochs)):
    running_loss = 0
    stats = {"correct": 0, "total": 0}

    for _ in tqdm(range(train_size_per_epoch)):
        inputs = next(iter(train_dataloader))
        gwe = GenerateWithEmbeddings(model,
                                     tokenizer,
                                     None, 
                                     None,
                                     mode="train",
                                     device=device,
                                     **inputs)
        loss = gwe.generate_step()

        if step % 50 == 0:
            print(epoch, step, loss.item(),gwe.last_predictions)
        
        step += 1
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    
    lr_scheduler.step()

  0%|          | 0/100 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
  0%|          | 0/100 [00:02<?, ?it/s]
  0%|          | 0/100 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 352.00 MiB (GPU 0; 23.65 GiB total capacity; 18.73 GiB already allocated; 268.12 MiB free; 19.04 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [35]:
for _ in tqdm(range(len(train_dataloader))):
    inputs = next(iter(train_dataloader))
    gwe = GenerateWithEmbeddings(model,
                                    tokenizer,
                                    None, 
                                    None,
                                    mode="train",
                                    device=device,
                                    **inputs)

    loss = gwe.generate_step()
    break

  0%|          | 0/1 [00:02<?, ?it/s]


In [41]:
gwe.generate()

res = gwe.get_current_texts()

In [44]:
print(res[0])

'''
def go_to(location : str)
def find(object : str)
def pick_up(object : str)
def put_down(object : str)
def find(object : str)
def ask(person : str, question : str, options: Optional[List[str]])
def say(message : str)
'''
def main():
    # Using the functions defined above, write a script to do the following: Go to the living room to get the TV Remote. Then go to the kitchen to pick up an apple. Give the TV Remote to Joydeep and the apple to Yash. They are both in their respective offices.
	<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


In [51]:
example_input = tokenizer.decode(train_dataset[0]['input_ids'])

In [45]:
from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "bigcode/santacoder"
device = "cuda" # for GPU usage or "cpu" for CPU usage

tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint, trust_remote_code=True).to(device)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


def print_hello_world():
    print("Hello World!")


def print_hello_


In [52]:
inputs = tokenizer.encode(example_input, return_tensors="pt").to(device)
outputs = model.generate(inputs)
print(tokenizer.decode(outputs[0]))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 145, but `max_length` is set to 20. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


'''
def go_to(location : str)
def find(object : str)
def pick_up(object : str)
def put_down(object : str)
def find(object : str)
def ask(person : str, question : str, options: Optional[List[str]])
def say(message : str)
'''
def main():
    # Using the functions defined above, write a script to do the following: Go to the living room to get the TV Remote. Then go to the kitchen to pick up an apple. Give the TV Remote to Joydeep and the apple to Yash. They are both in their respective offices.
	



In [11]:
from datasets import load_dataset

access_token = "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"
lua_data = load_dataset("bigcode/the-stack-smol", data_dir="data/lua",split="train", use_auth_token=access_token)
lua_data.shuffle()
lua_data = lua_data.train_test_split(test_size=0.1)
train_data = lua_data["train"]

Found cached dataset json (/home/saxenaya/.cache/huggingface/datasets/bigcode___json/bigcode--the-stack-smol-88fa5373c749e3eb/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [13]:
dataset_iterator = iter(train_data)

In [14]:
next_item = next(dataset_iterator)

In [16]:
next_item['content']

'local lfs = require("lib.lfs_ffi")\nlocal nfd = require("nfd")\nlocal physfs = require("lib.physfs")\nlocal requireUtils = require("utils.require")\nlocal threadHandler = require("utils.threads")\n\nlocal hasRequest, request = requireUtils.tryrequire("lib.luajit-request.luajit-request")\n\nlocal filesystem = {}\n\nfilesystem.supportWindowsInThreads = love.system.getOS() ~= "OS X"\n\nfunction filesystem.filename(path, sep)\n    sep = sep or physfs.getDirSeparator()\n\n    return path:match("[^" .. sep .. "]+$")\nend\n\nfunction filesystem.dirname(path, sep)\n    sep = sep or physfs.getDirSeparator()\n\n    return path:match("(.*" .. sep .. ")")\nend\n\nfunction filesystem.joinpath(...)\n    local paths = {...}\n    local sep = physfs.getDirSeparator()\n\n    return table.concat(paths, sep):gsub(sep .. sep, sep)\nend\n\nfunction filesystem.splitpath(s)\n    local sep = physfs.getDirSeparator()\n\n    return string.split(s, sep)()\nend\n\nfunction filesystem.samePath(path1, path2)\n    l

In [19]:
example_train_iterator = iter(train_dataset)
next_train_item = next(example_train_iterator)

In [20]:
next_train_item

{'input_ids': tensor([    2,  3013,   205,  1091,   227,  1012,    69,   306,    14,  2539,
           227,    32,   227,   410,    15,   205,  1091,   227,  2479,    14,
           955,   227,    32,   227,   410,    15,   205,  1091,   227,  2875,
            69,   400,    14,   955,   227,    32,   227,   410,    15,   205,
          1091,   227,   568,    69,  1413,    14,   955,   227,    32,   227,
           410,    15,   205,  1091,   227,  2479,    14,   955,   227,    32,
           227,   410,    15,   205,  1091,   227,   697,    14,  7665,   227,
            32,   227,   410,    18,   227,  4740,   227,    32,   227,   410,
            18,   227,  9260,    32,   227,  2882,    65,  1241,    65,   410,
         12988,   205,  1091,   227, 14658,    14,  1180,   227,    32,   227,
           410,    15,   205,  3013,   205,  1091,   227,  1203,  1442,   205,
           502, 16678,   227,    59,  1822,   227,   733,   227, 15772,   227,
         16896,   227, 10825,    18,   